Source code for tesliper.extraction.parser_base

"""This module contains a definition of Abstract Base Class for file parsers."""
import re
from abc import ABC, abstractmethod
from typing import Callable, Dict, Iterable, Tuple, Type, Union

from ..exceptions import InvalidStateError

_PARSERS: Dict[str, Type["ParserBase"]] = {}


[docs]class ParserBase(ABC): """Abstract Base Class for parsers implemented as finite state machines. This base class defines some methods to organize work parsers implemented as finite state machines: automates registration of methods and functions as parser's states, manages its execution, and registers derived class as parser used for certain type of files (which registry is used by :class:`.Soxhlet` object). The default parsing flow goes as follow: 1. method :meth:`.parse` is called with file handle as argument; 2. method :meth:`.initial` is set as a 'workhorse' 3. 'workhorse' is called for consecutive lines in file handle 4. :meth:`.initial` checks if any registered trigger matches current line 5. :meth:`.workhorse` is changed to method associated with first matching trigger 6. calling 'workhorse' on consecutive lines continues 7. :meth:`.parse` returns dictionary with extracted values To make this possible, each method marked as state should return dictionary (or sequence convertible to dict) and handle changing 'workhorse' to next appropriate state. To mark a method as parser's state use ParserBase.state decorator in class definition or add a state directly to parser instance using 'add_state' method. When subclassing ParserBase, one should implement :meth:`.initial` and :meth:`.parse` methods. Those abstract methods implement basic functionality, described above. See methods' documentation for more details. If you wish not to use default ParserBase's protocol, simply override those methods to your liking. Values for class attributes :attr:`.extensions` and :attr:`.purpose` should also be provided. To register class derived from ParserBase for use by :class:`.Soxhlet` object, simply set :attr:`.purpose` class attribute to name, under which class should be registered. Setting it to one of names already defined (e.g. 'gaussian') will override the default parser used by :class:`.Soxhlet` object. Attributes ---------- states: dict Dictionary of parser states, created automatically on object instantiation from object methods marked as states; method name is used as a key by default. triggers: dict Dictionary of triggers for parser states, created automatically on object instantiation from object methods marked as states with triggers; key for a particular state trigger should be the same as state's key in :attr:`.states` dictionary. """ @property @classmethod @abstractmethod def extensions(cls) -> Tuple[str]: return tuple() extensions.__doc__ = """File extensions that should be cosidered compatible with a parser subclassing :class:`.ParserBase`. It will be used by :class:`.Soxhlet` to identify which files to parse when reading files in batch. Should be a class attribute with a tuple of str, where each element is a file extension. May also be an empty tuple, if files discovery feature is not needed for the parser. """ @property @classmethod @abstractmethod def purpose(cls) -> str: return "" purpose.__doc__ = """An identifier for a parser subclassing :class:`.ParserBase`. It allows ``tesliper`` to pick a correct parser for each parsing task. A falsy value, i.e. an empty string or ``None`` prevents the parser from beeing registered for use by ``tesliper``. If custom subclass uses a *purpose* already known, e.g. "gaussian" or "spectra", it will override the original parser for this purpose. """ def __init__(self): self.states = {} self.triggers = {} states = ( (name, method) for name, method in ( (n, getattr(self, n)) for n in dir(self) if n != "workhorse" ) if hasattr(method, "is_state") and method.is_state ) for name, method in states: self.add_state(method, name, getattr(method, "trigger", None)) self.workhorse = self.initial def __init_subclass__(cls, **kwargs): global _PARSERS if cls.purpose is ParserBase.purpose: raise TypeError( f"`{cls.__name__}` must provide `purpose` class attribute. " "It may be an empty string if this class should not be registered." ) if cls.extensions is ParserBase.extensions: raise TypeError( f"`{cls.__name__}` must provide `extensions` class attribute. " "It may be an empty tuple if no extensions should be associated with " "this class." ) if cls.purpose: _PARSERS[cls.purpose] = cls if not hasattr(cls.initial, "is_state"): cls.initial.is_state = True @property def workhorse(self) -> Callable: """Callable marked as a current state used by parser object. Setter can take a callable or a string as a parameter. If name as string is passed to setter, it will be translated to a method registered as state. If no method was registered under this name, :class:`.InvalidStateError` will be raised. No other checks are performed when argument is callable.""" return self._workhorse @workhorse.setter def workhorse(self, state: Union[Callable, str]): if callable(state): self._workhorse = state else: try: self._workhorse = self.states[state] except KeyError: raise InvalidStateError( f"{state} is not callable nor registered state name" )
[docs] def add_state(self, state: Callable, name: str = "", trigger: str = ""): """Register callable as parser's state. This method registers a callable under *name* key in :attr:`.states` dictionary. If *trigger* parameter is given, it is registered under the same key in :attr:`.triggers` dictionary. Parameters ---------- state: Callable callable, that is to be registered as parser's state name: str, optional name under which the callable should be registered; defaults to callable.__name__ trigger: str, optional string with regular expression, that will be compiled with re module Returns ------- Callable callable object registered as state """ if not name: name = state.__name__ self.states[name] = state if trigger: self.triggers[name] = re.compile(trigger) elif hasattr(state, "trigger"): self.triggers[name] = re.compile(state.trigger) return state
[docs] def remove_state(self, name: str): """Removes the state from parser's registered states. Parameters ---------- name : str name of state, that should be unregistered Raises ------ InvalidStateError if no callable was registered under the name 'name' """ if name not in self.states: raise InvalidStateError(f"No state registered under name: {name}.") del self.states[name] if name in self.triggers: del self.triggers[name]
[docs] @abstractmethod def initial(self, line: str) -> dict: """An initial parser state. A default implementation checks if any of defined triggers matches a line and sets an associated state as parser's workhorse, if it does. This is an abstract method and should be overridden in subclass. Its default implementation can be used, however, by calling ``super().initial(line)`` in subclass's method. Notes ----- :meth:`.initial` method is always registered as parser's state. Parameters ---------- line: str currently parsed line Returns ------- dict empty dictionary""" for name, reg in self.triggers.items(): match = reg.match(line) if match: self.workhorse = self.states[name] return {} return {}
[docs] @abstractmethod def parse(self, lines: Iterable) -> dict: """Parses consecutive elements of iterable and returns data found as dictionary. Dictionary with extracted data is updated with workhorse's return value, so all states should return dictionary or compatible sequence. This is an abstract method and should be overridden in subclass. Its default implementation can be used, however, by calling ``data = super().parse(lines)`` in subclass's method. Notes ----- After execution - either successful or interrupted by exception - :attr:`.workhorse` is set back to :meth:`.initial` method. Parameters ---------- lines: Iterable iterable (i.e. file handle), that will be parsed, line by line Returns ------- dict dictionary with data extracted by parser Raises ------ InvalidStateError if dictionary can't be updated with state's return value""" data = {} try: for line in lines: output = self.workhorse(line) try: data.update(output) except TypeError as error: raise InvalidStateError( f"State {self.workhorse} should return value " f'convertible to a dictionary, not "{type(output)}".' ) from error except ValueError as error: raise InvalidStateError( f"Value returned by state {self.workhorse} could not " f"be converted to dictionary." ) from error except Exception: raise finally: self.workhorse = self.initial return data
[docs] @staticmethod def state(state=None, trigger=None): """Convenience decorator for registering a method as parser's state. It can be with or without 'trigger' parameter, like this: >>> @ParserBase.state ... def method(self, arg): pass or >>> @ParserBase.state(trigger='triggering regex') ... def method(self, arg): pass This function marks a method *state* as parser's state by defining ``is_state`` attribute on said method and setting its values to ``True``. If *trigger* is given, it is stored in method's attribute *trigger*. During instantiation of :class:`.ParserBase`'s subclass, methods marked as states are registered under ``method.__name__`` key in its :attr:`.states` (and possibly :attr:`.triggers`) attribute. It is meaningless if used outside of :class:`.ParserBase`'s subclass definition. Parameters ---------- state: Callable callable, that is to be registered as parser's state trigger: str, optional string with regular expression, that will be compiled with re module Returns ------- Callable callable object registered as state if 'state' was given or decorator if only 'trigger' was given Raises ------ TypeError if no arguments given InvalidStateError if *state* argument is not callable """ if callable(state): state.is_state = True if trigger: state.trigger = trigger return state elif state is None and trigger is None: raise TypeError("At least one argument must be given.") elif trigger is None: raise InvalidStateError( f"'state' argument should be callable, not {type(state)}" ) else: return lambda s, t=trigger: ParserBase.state(s, t)