Source code for tesliper.extraction.gaussian_parser

"""Paser for Gaussian output files."""
import logging as lgg
import re

from .parser_base import ParserBase

# LOGGER
logger = lgg.getLogger(__name__)
logger.setLevel(lgg.INFO)

# REGEXS
number_group = r"\s*(-?\d+\.?\d*)"
number = number_group.replace("(", "").replace(")", "")

TERMINATION_CRE = re.compile(r"Normal termination.*\n\Z")
SCF_CRE = re.compile(r"SCF Done.*?=" + number_group)  # use .search(line).group(1)
STOICH_CRE = re.compile(r"^ Stoichiometry\s*(\w*(?:\(\d+[+-]?,?\d*\))?)\n")

GEOM_LINE_CRE = re.compile(r"^\s*(\d+)\s+(\d+)\s+(\d+)" + 3 * number_group)

# VIBRATIONAL
vibr_genre = {
    "Frequencies": "freq",
    "Red. masses": "mass",
    "Frc consts": "frc",
    "IR Inten": "iri",
    "Dip. str.": "dip",
    "Rot. str.": "rot",
    "E-M angle": "emang",
    "Depolar (P)": "depolarp",
    "Depolar (U)": "depolaru",
    "RamAct": "ramact",
    "Raman Activ": "ramanactiv",
    "Dep-P": "depp",
    "Dep-U": "depu",
    "Alpha2": "alpha2",
    "Beta2": "beta2",
    "AlphaG": "alphag",
    "Gamma2": "gamma2",
    "Delta2": "delta2",
    "Raman1": "raman1",
    "ROA1": "roa1",
    "CID1": "cid1",
    "Raman2": "raman2",
    "ROA2": "roa2",
    "CID2": "cid2",
    "Raman3": "raman3",
    "ROA3": "roa3",
    "CID3": "cid3",
    "RC180": "rc180",
}

VIBRATIONAL_CRE = re.compile(
    r"^\s\s?([a-zA-Z.\-]+[0-9]*(?:\s?[a-zA-Z.()]+)?)\s*(?:(?:Fr= \d+)?--)"
    + 3 * number_group
)

# ELECTRONIC
EXCITED_CRE = re.compile(
    r"^ Excited State\s+\d+:[a-zA-Z\-\s\'\"]*(-?\d+\.?\d*) eV\s+(-?\d+\.?\d*) nm"
)
TRANSITIONS_CRE = re.compile(r"(\d+)\s*->\s*(\d+)\s+(-?\d+\.\d+)")
numbers = 4 * number + 2 * number_group + r"?\n"
NUMBERS_CRE = re.compile(numbers)


# CLASSES
[docs]class GaussianParser(ParserBase):
    """Parser for extracting data from human-readable output files from Gaussian
    computational chemistry software (.log and .out files).

    This class implements methods for reading information about conducted calculations'
    parameters, molecule energy, structure optimization, and calculation of spectral
    properties. It's use is as straightforward as:

    >>> parser = GaussianParser()
    >>> with open('path/to/file.out') as file:
    >>>     data = parser.parse(file)

    Dictionary with data extracted is also stored as :attr:`.data` attribute of instance
    used for parsing. Each key in said dictionary is a name of its value data type,
    called from now on a 'data genre' (to avoid confusion with Python's data type).
    Below is a full list of data genres recognized by this parser, with their
    description:

    freq : list of floats, available from freq job
        harmonic vibrational frequencies (cm^-1)
    mass : list of floats, available from freq job
        reduced masses (AMU)
    frc : list of floats, available from freq job
        force constants (mDyne/A)
    iri : list of floats, available from freq job
        IR intensities (KM/mole)
    dip : list of floats, available from freq=VCD job
        dipole strengths (10**-40 esu**2-cm**2)
    rot : list of floats, available from freq=VCD job
        rotational strengths (10**-44 esu**2-cm**2)
    emang : list of floats, available from freq=VCD job
        E-M angle = Angle between electric and magnetic dipole transition moments (deg)
    depolarp : list of floats, available from freq=Raman job
        depolarization ratios for plane incident light
    depolaru : list of floats, available from freq=Raman job
        depolarization ratios for unpolarized incident light
    ramanactiv : list of floats, available from freq=Raman job
        Raman scattering activities (A**4/AMU)
    ramact : list of floats, available from freq=ROA job
        Raman scattering activities (A**4/AMU)
    depp : list of floats, available from freq=ROA job
        depolarization ratios for plane incident light
    depu : list of floats, available from freq=ROA job
        depolarization ratios for unpolarized incident light
    alpha2 : list of floats, available from freq=ROA job
        Raman invariants Alpha2 = alpha**2 (A**4/AMU)
    beta2 : list of floats, available from freq=ROA job
        Raman invariants Beta2 = beta(alpha)**2 (A**4/AMU)
    alphag : list of floats, available from freq=ROA job
        ROA invariants AlphaG = alphaG'(10**4 A**5/AMU)
    gamma2 : list of floats, available from freq=ROA job
        ROA invariants Gamma2 = beta(G')**2 (10**4 A**5/AMU)
    delta2 : list of floats, available from freq=ROA job
        ROA invariants Delta2 = beta(A)**2, (10**4 A**5/AMU)
    raman1 : list of floats, available from freq=ROA job
        Far-From-Resonance Raman intensities =ICPu/SCPu(180) (K)
    roa1 : list of floats, available from freq=ROA job
        ROA intensities =ICPu/SCPu(180) (10**4 K)
    cid1 : list of floats, available from freq=ROA job
        CID=(ROA/Raman)*10**4 =ICPu/SCPu(180)
    raman2 : list of floats, available from freq=ROA job
        Far-From-Resonance Raman intensities =ICPd/SCPd(90) (K)
    roa2 : list of floats, available from freq=ROA job
        ROA intensities =ICPd/SCPd(90) (10**4 K)
    cid2 : list of floats, available from freq=ROA job
        CID=(ROA/Raman)*10**4 =ICPd/SCPd(90)
    raman3 : list of floats, available from freq=ROA job
        Far-From-Resonance Raman intensities =DCPI(180) (K)
    roa3 : list of floats, available from freq=ROA job
        ROA intensities =DCPI(180) (10**4 K)
    cid3 : list of floats, available from freq=ROA job
        CID=(ROA/Raman)*10**4 =DCPI(180)
    rc180 : list of floats, available from freq=ROA job
        RC180 = degree of circularity
    wavelen : list of floats, available from td job
        excitation energies (nm)
    ex_en : list of floats, available from td job
        excitation energies (eV)
    eemang : list of floats, available from td job
        E-M angle = Angle between electric and magnetic dipole transition moments (deg)
    vdip : list of floats, available from td job
        dipole strengths (velocity)
    ldip : list of floats, available from td job
        dipole strengths (length)
    vrot : list of floats, available from td job
        rotatory strengths (velocity) in cgs (10**-40 erg-esu-cm/Gauss)
    lrot : list of floats, available from td job
        rotatory strengths (length) in cgs (10**-40 erg-esu-cm/Gauss)
    vosc : list of floats, available from td job
        oscillator strengths
    losc : list of floats, available from td job
        oscillator strengths
    transitions : list of lists of lists of (int, int, float), available from td job
        transitions (first to second) and their coefficients (third)
    scf : float, always available
        SCF energy
    zpe : float, available from freq job
        Sum of electronic and zero-point Energies (Hartree/Particle)
    ten : float, available from freq job
        Sum of electronic and thermal Energies (Hartree/Particle)
    ent : float, available from freq job
        Sum of electronic and thermal Enthalpies (Hartree/Particle)
    gib : float, available from freq job
        Sum of electronic and thermal Free Energies (Hartree/Particle)
    zpecorr : float, available from freq job
        Zero-point correction (Hartree/Particle)
    tencorr : float, available from freq job
        Thermal correction to Energy (Hartree/Particle)
    entcorr : float, available from freq job
        Thermal correction to Enthalpy (Hartree/Particle)
    gibcorr : float, available from freq job
        Thermal correction to Gibbs Free Energy (Hartree/Particle)
    command : str, always available
        command used for calculations
    normal_termination : bool, always available
        true if Gaussian job seem to exit normally, false otherwise
    optimization_completed : bool, available from opt job
        true if structure optimization was performed successfully
    version : str, always available
        version of Gaussian software used
    charge : int, always available
        molecule's charge
    multiplicity : int, always available
        molecule's spin multiplicity
    input_atoms : list of str, always available
        input atoms as a list of atoms' symbols
    input_geom : list of lists of floats, always available
        input geometry as X, Y, Z coordinates of atoms
    stoichiometry : str, always available
        molecule's stoichiometry
    last_read_atoms : list of ints, always available
        molecule's atoms as atomic numbers
    last_read_geom : list of lists of floats, always available
        molecule's geometry (last one found in file) as X, Y, Z coordinates of atoms
    optimized_atoms : list of ints, available from successful opt job
        molecule's atoms read from optimized geometry as atomic numbers
    optimized_geom : list of lists of floats, available from successful opt job
        optimized geometry as X, Y, Z coordinates of atoms

    Attributes
    ----------
    data : dict
        Data extracted during last parsing."""

    purpose = "gaussian"
    extensions = ("log", "out")

    def __init__(self):
        super().__init__()
        self._iterator = iter([])
        self.data = {}

[docs]    def parse(self, lines) -> dict:
        """Parses content of Gaussian output file and returns dictionary of found data.

        Parameters
        ----------
        lines : iterator
            Gaussian output file in a form iterable by lines of text. It may be a file
            handle, a list of strings, an io.StringIO instance, or similar. Please note
            that it should not be just a string instance, as it is normally iterated
            by a character, not by a line.

        Returns
        -------
        dict
            Dictionary of extracted data."""
        # TODO: supplement docstring
        self.workhorse = self.initial
        self.data = {}
        self._iterator = iter(lines)
        try:
            for line in self._iterator:
                self.workhorse(line)
        except StopIteration:
            filename = getattr(self._iterator, "name", None)
            filename = f'"{filename}" file' if filename else None
            logger.debug(
                f"Unexpected stop of iteration occurred while parsing "
                f"{filename or self._iterator}. Job didn't finish or content may be "
                f"corrupted."
            )
            self.data["normal_termination"] = False
        return self.data

[docs]    @ParserBase.state
    def initial(self, line: str) -> None:
        """First step of parsing Gaussian output file. It populates parser.data
        dictionary with these data genes: 'normal_termination', 'version', 'command',
        'charge', 'multiplicity', 'input_geom'. Optionally, 'optimization_completed'
        genre is added if optimization was requested in calculation job.

        Parameters
        ----------
        line : str
            Line of text to parse."""
        data, iterator = self.data, self._iterator
        # assume error termination until output file says otherwise
        data["normal_termination"] = False
        while not line == " Cite this work as:\n":
            line = next(iterator)
        data["version"] = next(iterator).strip(" \n,")
        while not line.startswith(" #"):
            line = next(iterator)
        command = []
        while not line.startswith(" --"):
            command.append(line.strip("# \n"))
            line = next(iterator)
        command = data["command"] = " ".join(command)
        if "opt" in command:
            data["optimization_completed"] = False
        while not line == " Symbolic Z-matrix:\n":
            line = next(iterator)
        c_and_m = re.match(r" Charge =\s*(-?\d) Multiplicity = (\d)", next(iterator))
        data["charge"], data["multiplicity"] = map(int, c_and_m.groups())
        line = next(iterator).strip()
        input_geom = []
        pattern = r"(\w+)" + 3 * number_group
        while line:  # reads symbolic Z-matrix
            atom = re.match(pattern, line)
            label, *coordinates = atom.groups()
            input_geom.append([label, list(map(float, coordinates))])
            line = next(iterator).strip()
        data["input_atoms"], data["input_geom"] = map(list, zip(*input_geom))
        self.workhorse = self.wait

[docs]    @ParserBase.state
    def wait(self, line: str) -> None:
        """This function searches for lines of text triggering other parsing states.
        It also updates a parser.data dictionary with 'normal_termination', 'scf',
        'stoichiometry' data genres.

        Parameters
        ----------
        line : str
            Line of text to parse."""
        for name, reg in self.triggers.items():
            match = reg.match(line)
            if match:
                self.workhorse = name
                return
        if "Error termination" in line:
            self.data["normal_termination"] = False
        elif "Normal termination" in line:
            self.data["normal_termination"] = True
        elif line.startswith(" SCF Done:"):
            self.data["scf"] = float(SCF_CRE.search(line).group(1))
        elif line.startswith(" Stoichiometry"):
            self.data["stoichiometry"] = STOICH_CRE.match(line).group(1)
        elif "Proceeding to internal job step number" in line:
            # last job determines if termination was normal
            self.data["normal_termination"] = False

[docs]    @ParserBase.state(trigger=re.compile(r"^\s+Standard orientation"))
    def geometry(self, line: str) -> None:
        """Function for extracting information about molecule standard orientation
        geometry from Gaussian output files. It updates parser.data dictionary with
        'last_read_atoms' and 'last_read_geom' data genres.

        Parameters
        ----------
        line : str
            Line of text to parse."""
        data, iterator = self.data, self._iterator
        match = GEOM_LINE_CRE.match(line)
        while not match:
            line = next(iterator)
            match = GEOM_LINE_CRE.match(line)
        geom = []
        while match:
            geom.append(match.groups())
            line = next(iterator)
            match = GEOM_LINE_CRE.match(line)
        geom = ((int(a), [float(x), float(y), float(z)]) for _, a, _, x, y, z in geom)
        # produce list and list of lists instead of tuple and tuple of lists
        data["last_read_atoms"], data["last_read_geom"] = map(list, zip(*geom))
        self.workhorse = self.wait

[docs]    @ParserBase.state(trigger=re.compile("^ Search for a local minimum."))
    def optimization(self, line: str) -> None:
        """This method scans optimization data in Gaussian output file, updating
        parser.data dictionary with 'stoichiometry', 'scf', 'optimization_completed',
        'optimized_atoms', and 'optimized_geom' data genres (last two via
        :meth:`.geometry` method).

        Parameters
        ----------
        line : str
            Line of text to parse.
        """
        # TODO: modify this to only run if optimization requested
        if self.triggers["geometry"].match(line):
            self.geometry(line)
            self.workhorse = self.optimization
        elif line.startswith(" Stoichiometry"):
            self.data["stoichiometry"] = STOICH_CRE.match(line).group(1)
        elif line.startswith(" SCF Done:"):
            self.data["scf"] = float(SCF_CRE.search(line).group(1))
        elif line.startswith(" Optimization completed."):
            self.data["optimization_completed"] = True
            if "optimized_geom" not in self.data:
                self.data["optimized_geom"] = self.data["last_read_geom"]
                self.data["optimized_atoms"] = self.data["last_read_atoms"]
        elif line.startswith(" Error termination"):
            self.data["normal_termination"] = False
        if line.startswith((" Error termination", " Job cpu time")):
            self.workhorse = self.wait

[docs]    @ParserBase.state(trigger=re.compile("^ Harmonic frequencies"))
    def frequencies(self, line: str) -> None:
        """Responsible for extracting harmonic vibrations-related data and information
        about molecule's energy.

        Parameters
        ----------
        line : str
            Line of text to parse."""
        data, iterator = self.data, self._iterator
        while not line == "\n":
            # while frequencies section is not over
            match = VIBRATIONAL_CRE.match(line)
            while match:
                # unpack values from current line to list of corresponding genre
                name, *values = match.groups()
                genre = vibr_genre[name]  # convert gaussian line name to genre
                data.setdefault(genre, list()).extend(float(x) for x in values)
                line = next(iterator)
                match = VIBRATIONAL_CRE.match(line)
            line = next(iterator)
        while not line.startswith(" Zero-point correction="):
            line = next(iterator)
        # parse energies values
        for genre in "zpecorr tencorr entcorr gibcorr zpe ten ent gib".split():
            data[genre] = float(re.search(number, line).group())
            line = next(iterator)
        self.workhorse = self.wait

    def _excited_states(self, line: str) -> list:
        """Helper function for extracting electronic transitions-related data
        from content of Gaussian output file.

        Parameters
        ----------
        line : str
            Line of text to parse.

        Returns
        -------
        list
            List of floats with data extracted from input string."""
        iterator = self._iterator
        while not line.startswith(" Excited State"):
            line = next(iterator)
        # map obj unpacked in calling method
        out = [map(float, EXCITED_CRE.match(line).groups())]
        while line.strip():
            line = next(iterator)
            match = TRANSITIONS_CRE.search(line)
            if match:
                out.append(match.groups())
        return out

[docs]    @ParserBase.state(trigger=re.compile("^ Excited states from"))
    def excited(self, line: str) -> None:
        """Responsible for extracting electronic transitions-related data from Gaussian
        output file. Updates parser.data dictionary with 'ldip', 'losc', 'vdip', 'vosc',
        'vrot', 'eemang', 'lrot', 'wavelen', 'ex_en', and 'transitions' data genres.

        Parameters
        ----------
        line : str
            Line of text to parse."""
        data, iterator = self.data, self._iterator
        for genres, header in (
            (("ldip", "losc"), "electric dipole"),
            (("vdip", "vosc"), "velocity dipole"),
            (("vrot", "eemang"), "Rotatory Strengths"),
            (("lrot", ""), "Rotatory Strengths"),
        ):
            while header not in line:
                line = next(iterator)
            next(iterator)  # skip column names
            match = NUMBERS_CRE.search(next(iterator))
            values = []
            while match:
                values.append(float(x) if x else None for x in match.groups())
                line = next(iterator)
                match = NUMBERS_CRE.search(line)
            for genre, values in zip(genres, zip(*values)):
                if genre:
                    data[genre] = values
        while not line.startswith(" **"):
            (energy, wavelen), *transitions = self._excited_states(line)
            data.setdefault("wavelen", []).append(wavelen)
            data.setdefault("ex_en", []).append(energy)
            data.setdefault("transitions", []).append(
                [[int(low), int(high), float(coef)] for low, high, coef in transitions]
            )
            line = next(iterator)
        self.workhorse = self.wait