Source code for epsman.elecStructure.util

"""
ePSman electronic structure utility functions
---------------------------------------------

Utility functions for use with ePSman.

24/01/24   v1

See also ../_util.py for general utility functions

"""

from epsman._util import fileParse
import pandas as pd


def readMoldenGeoms(fileName, keyWord='[Atoms]', 
                    skiprowsOffset = 0, nrowType='auto',
                    widths = [2,7,7,20,20,20],
                    names= ['Ind','Species','Atomic Num.','x','y','z'],
                    verbose = 0):
    """
    Read geometry/ies in Molden format.
    
    Parameters
    ----------
    fileName : str or Path object
        File to read.
        
    keyWord : str, default = '[Atoms]'
        Keyword to search for in file.
        Default case flags Molden geometry sections.
        If multiple sections are found they will all be read in.
        
    skiprowsOffset : int, default = 0
        Additional header rows to skip.
        Added to the line number reported by fileParse() for each keyWord
        match to give the `skiprows` value passed to pd.read_fwf().
        Default case adds no extra rows.
    
    nrowType : str or int, default = 'auto'
        If 'auto' tries to read all lines until next section; the final
        section is read to the end of the file.
        If an int, read this number of lines after keyWord.
        
    widths : list, default = [2,7,7,20,20,20]
        Column widths, passed to pd.read_fwf() for reading sections.
        Default case for normal Molden format.
    
    names : list, default = ['Ind','Species','Atomic Num.','x','y','z']
        Columns names, given in the order of the *returned* DataFrame.
        The first two labels are swapped when reading, since the file rows
        are read species-first (then index), and the columns are reordered
        to the order given here afterwards.
        Must include 'Ind', 'x', 'y' and 'z', which are used for
        re-indexing and for building the positions dictionary.
        Default case matches epsman.elecStructure.gamess.getAtoms()  
        
    verbose : int, default = 0
        Passed to fileParse(). If > 1 also print the line number of each
        section as it is read.
        
    Returns
    -------
    dict
        Contains geometries keyed by integer. Each entry is a dict with:
        
        - 'pd' : Pandas DataFrame of the section, columns ordered as `names`.
        - 'positionsDict' : {row index: [x, y, z]}, for use with existing
          `setCoords()` method.
        
        Also key 'details' for metadata (file, line numbers, number of
        keyword matches and number of geometries read).
        
        
    Notes
    -----
    
    - RDkit doesn't read Molden format.
    - RDkit does support a range of other file types, [x,y,z] might be sufficient? https://www.rdkit.org/docs/source/rdkit.Chem.rdmolfiles.html#rdkit.Chem.rdmolfiles.MolFromXYZFile
    - RDkit does support Pandas, but only for table of molecules output? E.g. https://xinhaoli74.github.io/blog/rdkit/2021/01/06/rdkit.html#PandasTools, http://rdkit.org/docs/source/rdkit.Chem.PandasTools.html
    - CCLIB writes, but doesn't read, Molden format. Could still be used for conversion to SDF for RDkit...?
        
    """

    print(f"*** Reading geometries from file {fileName}...")
    
    # With endPhrase=keyWord this gets the correct line-list, but the returned
    # segments are blank, so only the line numbers are used below.
    # (With no endPhrase everything is pulled as a single segment.)
    lineNumbers, fileSegs = fileParse(fileName, startPhrase=keyWord, endPhrase=keyWord, verbose=verbose)
    startLines = lineNumbers[0]
    
    # Work on copies so the (shared) default lists are never handed to pandas.
    colWidths = list(widths)
    colNames = list(names)
    
    # `names` is the output order (index first); rows in the file are read
    # species first, so swap the first two labels for parsing.
    fileNames = colNames[:2][::-1] + colNames[2:]
    
    geomDict = {}
    
    for n, startLine in enumerate(startLines):

        if verbose > 1:
            print(f"Reading {n}, lines: {startLine}")
        
        if nrowType == 'auto':
            if n + 1 < len(startLines):
                # Rows up to the next keyWord match, less two lines.
                # NOTE(review): the 2-line allowance is carried over from the
                # original arithmetic (skiprowsOffset cancelled out there);
                # presumably it covers lines separating sections - confirm
                # against the line numbering used by fileParse().
                nrows = startLines[n+1] - startLine - 2
            else:
                nrows = None  # For final item set None to read to end of file.
        else:
            nrows = nrowType
        
        geom = pd.read_fwf(fileName, skiprows=startLine + skiprowsOffset, nrows=nrows,
                           header=None, names=fileNames, widths=colWidths)
        
        # Reorder columns to the requested output order. Take an explicit copy
        # so the column assignment below acts on an independent frame (avoids
        # SettingWithCopyWarning from assigning on an indexing result).
        geom = geom[colNames].copy()
        
        # Reindex to 0-offset if required
        if geom['Ind'].min() == 1:
            geom['Ind'] = geom['Ind'] - 1
    
        # Set positions-only dictionary form for use with existing `setCoords()` method
        # Keys are the DataFrame row index, as produced by to_dict().
        geomTemp = geom.to_dict()
        positionsDict = {k: [geomTemp['x'][k], geomTemp['y'][k], geomTemp['z'][k]]
                         for k in geomTemp['Ind']}
        
        geomDict[n] = {'pd': geom, 'positionsDict': positionsDict}
        
    
    # Count geometries before the metadata entry is added to the dictionary.
    nGeoms = len(geomDict)
    
    geomDict['details'] = { 'file': fileName,
                            'lineNumbers': lineNumbers,
                            'key word matches': len(startLines),
                            'geoms': nGeoms
                          }
    
    print(f"Read {geomDict['details']['geoms']} geometries.")
    
    return geomDict