Source code for glyles.converter

import os
import sys
import logging
from typing import Union

from joblib import Parallel, delayed, parallel_backend

from glyles.glycans.poly.gwb_glycan import GWBGlycan
from glyles.glycans.poly.glycan import Glycan
from glyles.glycans.utils import ParseError


def preprocess_glycans(glycan, glycan_list, glycan_file):
    """
    Preprocess the static inputs for the parsing into one single list

    Args:
        glycan (str): single glycan to parse
        glycan_list (List[str]): list of glycans to parse
        glycan_file (str): filepath of file to read glycans from

    Returns:
        list of glycans in the order they are handed to the function, i.e. glycan, glycan_list, glycan_file
    """
    glycans = []

    # fill a list with all glycans to convert
    if glycan is not None:
        glycans.append(glycan)
    if glycan_list is not None:
        glycans += glycan_list
    if glycan_file is not None:
        # check if the file is valid and read it out
        if not os.path.isfile(glycan_file):
            raise ValueError(f"{glycan_file} does not exists, cannot read glycans.")
        for line in open(glycan_file, "r").readlines():
            glycans.append(line.strip())
    return glycans


[docs]def convert( glycan=None, glycan_list=None, glycan_file=None, glycan_generator=None, output_file=None, returning=True, verbose: Union[int, None] = logging.WARNING, cpu_count=1, full=True, ): """ Convert glycans of different input formats. All glycans have to be in IUPAC-condensed notation, but how they're organized, can change. Either as single glycan, list or tuples of glycans, a file, or a generator. All will be converted to SMILES. Args: glycan (str): Single glycan to be converted from IUPAC to SMILES glycan_list (List[str]): list of glycans to convert glycan_file (str): File to read the glycans from glycan_generator (generator): generator yielding iupac representation. Together with output_generator=True this does not create any lists output_file (str): File to save the converted glycans in returning (bool): Flag indicating to return a list of tuples verbose (Union[int, None]): Flag indicating to have no prints from this method cpu_count (int): Number of CPU cores to use for parallel processing. Behavior as described in joblib: https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html full (bool): Flag indicating that only fully convertible glycans should be returned, i.e. all modifications such as 3-Anhydro-[...] are also present in the SMILES Returns: List of type (IUPAC, SMILES) items giving the converted SMILES formulas. Only if returning=True is set. """ if verbose is None: verbose = logging.CRITICAL logging.basicConfig(level=verbose) # collect all data and return if no data were provided glycans = preprocess_glycans(glycan, glycan_list, glycan_file) if len(glycans) == 0 and glycan_generator is None: logging.info("List of glycans is empty") return # determine the output format if output_file is not None: if os.path.isdir(os.path.dirname(os.path.abspath(output_file))): output = open(output_file, "w") else: logging.warning("Path of output-file does not exist! Results will be printed on stdout.") output = sys.stdout returning = False else: if returning: output = [] else: logging.warning("No output-file specified, results will be printed on stdout.") output = sys.stdout # convert the IUPAC strings into SMILES strings from the input list and from the input generator container = [] if len(glycans) != 0: container.append(glycans) if glycan_generator is not None: container.append(glycan_generator) for container in container: with parallel_backend('multiprocessing', n_jobs=cpu_count): results = Parallel()(delayed(generate)(iupac, full) for iupac in container) if returning: output += results else: for iupac, smiles in results: print(iupac, smiles, file=output, sep=",") if returning: return output elif output_file is not None: output.close() if verbose is None: logger.disabled = False
[docs]def convert_generator( glycan=None, glycan_list=None, glycan_file=None, glycan_generator=None, verbose=logging.INFO, cpu_count=1, full=True, ): """ Convert glycans of different input formats. All glycans have to be in IUPAC-condensed notation, but how they're organized, can change. Either as single glycan, list or tuples of glycans, a file, or a generator. All will be converted to SMILES, output by a generator. Args: glycan (str): Single glycan to be converted from IUPAC to SMILES glycan_list (List[str]): list of glycans to convert glycan_file (str): File to read the glycans from glycan_generator (generator): generator yielding iupac representation. Together with output_generator=True this does not create any lists verbose (Union[int, None]): Flag indicating to have no output-messages from this method cpu_count (int): Number of CPU cores to use for parallel processing. Behavior as described in joblib: https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html full (bool): Flag indicating that only fully convertible glycans should be returned, i.e. all modifications such as 3-Anhydro-[...] are also present in the SMILES Returns: Generator generating pairs of type (IUPAC, SMILES) items giving the converted SMILES formulas for the IUPACs. """ if verbose is None: logger = logging.getLogger() logger.disabled = True else: logging.basicConfig(level=verbose) glycans = preprocess_glycans(glycan, glycan_list, glycan_file) if len(glycans) == 0 and glycan_generator is None: logging.info("List of glycans is empty") return # Convert the glycans ... if len(glycans) != 0: for glycan in glycans: yield generate(glycan, full) # Convert the glycans ... if glycan_generator is not None: for glycan in glycan_generator: yield generate(glycan, full) if verbose is None: logger.disabled = False
def generate(glycan, full): """ Actually generate the SMILES string based on the glycan given in IUPAC notation. Parameters: glycan (str): Glycan molecule described by its IUPAC string full (bool): flag indicating to only return SMILES string that include all modifications from the IUPAC Returns: A pair of glycan represented with its IUPAC string and SMILES string """ try: # ... by passing them to the glycan class to parse them and return them as intended if "End--" in glycan: return GWBGlycan(glycan, full=full).get_smiles() return glycan, Glycan(glycan, full=full).get_smiles() # catch any exception at glycan level to not destroy the whole pipeline because of one mis-formed glycan except ParseError as e: logging.error(f"A parsing error occurred with {glycan}: {e.__class__}\n" f"Error message: {e.__str__()}") return glycan, "" except Exception as e: logging.error(f"An unexpected exception occurred with with {glycan}. This glycan cannot be parsed. " f"Error message: {e.__str__()}") return glycan, "" if __name__ == "__main__": # print(convert("GalNAc")) print(Glycan("GalNAc").to_iupac())