Source code for glyles.glycans.poly.glycan

import logging
from collections import Counter
import re

import networkx as nx
from glyles.iupac.IUPACParser import IUPACParser
from networkx.algorithms.isomorphism import DiGraphMatcher
import pydot
from antlr4 import InputStream, CommonTokenStream
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

from glyles.glycans.factory.factory import MonomerFactory
from glyles.glycans.mono.reactor import functional_groups
from glyles.glycans.mono.monomer import Monomer
from glyles.glycans.poly.anltr_error_listener import GlyLESErrorListener
from glyles.glycans.poly.merger import Merger
from glyles.glycans.poly.walker import TreeWalker
from glyles.glycans.utils import ParseError
from glyles.gwb.GWBLexer import GWBLexer
from glyles.iupac.IUPACLexer import IUPACLexer
from glyles.glycans.utils import smiles2mol

if not hasattr(IUPACLexer, "MOD"):
    IUPACLexer.MOD = IUPACLexer.QMARK + 1
    GWBLexer.MOD = GWBLexer.QMARK + 1


def compare_smiles(
        c: Chem.Mol,
        s: Chem.Mol
):
    """
    Compare two molecules if they are equal.

    Args:
        c: molecule 1
        s: molecule 2

    Returns:
        True if their kekulized, canonical SMILES string is equal; False otherwise
    """
    Chem.Kekulize(c)
    Chem.Kekulize(s)

    ssmiles = Chem.MolToSmiles(s, kekuleSmiles=True)
    csmiles = Chem.MolToSmiles(c, kekuleSmiles=True)
    return csmiles == ssmiles


def recipe_equality(
        glycan: Monomer,
        query: Monomer,
        no: bool = False,
        some: bool = False,
        every: bool = False
):
    """
    Checking if two monomers are considered isomorphic under the given mode of isomorphism. One of the bool-flags has
    to be set.

    Note:
        node-match's first argument is always from first node, second argument always from second graph

    Args:
        glycan: monomer to search in
        query: monomer to find in glycan
        no: If True, only match basic monosaccharide, no functional groups
        some: If True, match all of query's groups but not all of glycan's groups
        every: If True, glycans must match exactly, i.e., modifications have to be the same

    Returns:
        True if glycan and query are considered equal under given circumstances
    """
    # check if exactly one isomorphism mode is activated
    if sum([no, some, every]) != 1:
        raise ValueError("Exactly one of arguments no, some, every has to be set to True.")

    if no:
        # if functional groups should not be considered for isomorphism, just check the root monomer of both nodes
        recipe_glycan, recipe_query = glycan.get_recipe(), query.get_recipe()
        return recipe_glycan[list(zip(*recipe_glycan))[1].index(IUPACLexer.SAC)] == \
               recipe_query[list(zip(*recipe_query))[1].index(IUPACLexer.SAC)]
    if some:
        # raise NotImplementedError("Matching to a subset of functional groups is not yet implemented. Coming soon.")
        """# if query's fgs have to be a subset of glycan's fgs, do some magic
        recipe_glycan, recipe_query = glycan.get_recipe(), query.get_recipe()
        if recipe_glycan[list(zip(*recipe_glycan))[1].index(IUPACLexer.SAC)] != \
                recipe_query[list(zip(*recipe_query))[1].index(IUPACLexer.SAC)]:
            return False
        iso = find_isomorphism_nx(glycan, query, query.name, query.c1_find)
        inv = dict([(v, k) for k, v in iso])
        ring_o = query.x[querx.x[:, 1] == 100]
        if ring_o not in inv:
            return False"""
        return len(set(query.get_recipe()).difference(set(glycan.get_recipe()))) == 0
    if every:
        # if all functional groups have to be matched, compare the smiles strings of both
        return compare_smiles(glycan.get_structure(), query.get_structure())
    return False


def prepare(iupac):
    return re.sub(r'(?<!\d)dTal', '6dTal', iupac)


[docs]class Glycan: """ This class is like an interaction with the Parser for the IUPAC representation of the glycan. The grammar for glycans is defined using ANTLR (https://www.antlr.org/). From this ANTLR is able to generate lexer and parser that fit the defined grammar. Don't touch those files those are auto generated and therefore mostly uncommented. The defined grammar discards the last glycan which is used to define the root of the glycan tree. Therefore, the resulting abstract syntax trees (AST)s are not intuitive. """ def __init__(self, iupac, root_orientation="n", start=100, tree_only=False, full=True): """ Initialize the glycan from the IUPAC string. Args: iupac (str): IUPAC string representation of the glycan to represent root_orientation (str): orientation of the root monomer in the glycan (choose from 'a', 'b', 'n') start (int): ID of the atom to start with in the root monomer when generating the SMILES tree_only (int): Flag indicating to only parse the tree of glycans and not the modifications full (bool): Flag indicating that only fully convertible glycans should be returned, i.e. all modifications such as 3-Anhydro-[...] are also present in the SMILES """ self.iupac = iupac self.parse_tree = None self.grammar_tree = None self.glycan_smiles = None self.root_orientation = root_orientation self.start = start self.tree_only = tree_only self.factory = MonomerFactory() self.full = full self.tree_full = True self.parse()
[docs] def summary(self): """ Aggregate some statistics of the glycan. This includes in the following order [the key in the output dictionary in brackets]: molecular formula [formula], number of atoms [atoms], number of bonds [bonds], number of rings [rings], number of monomers [monomers], max depth of the tree [depth], the root monomer [root], list of all leaf monomers [leaves], molecular weight [weight]. Returns: The above named statistics are returned as dictionary with the given keys. """ # generate smiles for this molecule and check it's not empty smiles = self.get_smiles() if smiles == "": raise ValueError("SMILES string for this glycan is empty, check if the IUPAC is convertable.") # generate molecule with RDKit and check it's not None mol = smiles2mol(smiles) if mol is None: raise ValueError("Generated SMILES is invalid, rdkit couldn't read it in.") # compute the statistics and return the results inplace return { "formula": Chem.rdMolDescriptors.CalcMolFormula(mol), "weight": ExactMolWt(mol), "atoms": len(mol.GetAtoms()), "bonds": len(mol.GetBonds()), "rings": len(mol.GetRingInfo().AtomRings()), "monomers": len(self.parse_tree.nodes), "types": dict(Counter([self.parse_tree.nodes[n]["type"].get_name("full") for n in self.parse_tree.nodes])), "root": self.parse_tree.nodes[0]["type"].get_name("full"), "leaves": [self.parse_tree.nodes[n]["type"].get_name("full") for n, d in self.parse_tree.out_degree() if d == 0], "depth": max([v for k, v in nx.shortest_path_length(self.parse_tree, 0).items()]), }
[docs] def count(self, glycan, match_all_fg=False, match_some_fg=False, match_edges=False, match_nodes=False, match_leaves=False, match_root=False): """Match a glycan against a query molecule and return the number of hits. This matching can be restricted by setting some flags introducing additional conditions of the matches. This matching does not include the configuration (alpha/beta/undefined) of the root monomer of the query. So query "Gal" will result a hit in "GalNAc6S b" but neither do "Gal a" or "Gal b". Args: glycan (Union[str, 'Glycan']): query glycan to be matched against the monomers of this glycan match_all_fg (bool): flag indicating to match all fgs of the query glycan to all fgs of a monomer match_some_fg (bool): flag indicating to match all fgs of the query glycan to some fgs of a monomer match_edges (bool): flag indicating to also match edges match_nodes (bool): flag indicating to match against all nodes match_leaves (bool): flag indicating to match against the leaf monomers only match_root (bool): flag indicating to match against the root monomer only Returns: The number of matches of the query in this glycan under the given conditions """ if sum([match_nodes, match_leaves, match_root]) != 1: raise ValueError("Exactly one of match_nodes, match_leaves, match_root has to be True.") if isinstance(glycan, str): glycan = Glycan(glycan) if len(glycan.parse_tree.nodes) != 1 and (match_leaves or match_root): raise ValueError("Cannot match polymeric glycan against leaves of glycan. Leaves are monomers.") # node-match's first argument is always from first node, second argument always from second graph kwargs = { "node_match": lambda x, y: recipe_equality(x["type"], y["type"], no=True), } if match_some_fg: kwargs["node_match"] = lambda x, y: recipe_equality(x["type"], y["type"], some=True) elif match_all_fg: kwargs["node_match"] = lambda x, y: recipe_equality(x["type"], y["type"], every=True) if match_edges: kwargs["edge_match"] = lambda e, f: e["type"] == f["type"] if match_nodes: matcher = DiGraphMatcher(self.parse_tree, glycan.parse_tree, **kwargs) return len(list(matcher.subgraph_isomorphisms_iter())) if match_leaves: q = glycan.parse_tree.nodes[0] return sum( [kwargs["node_match"](self.parse_tree.nodes[n], q) for n, d in self.parse_tree.out_degree() if d == 0]) if match_root: return sum([kwargs["node_match"](self.parse_tree.nodes[0], glycan.parse_tree.nodes[0])])
[docs] def count_protonation(self, grouping): """ Count the possible deprotonation sites in the final molecule. Args: grouping (bool): If True, count functional groups based on their common atom, so an SO2 group will count as 1. Otherwise, count groups based on the protonizable oxygen atoms, so an SO2 group will count as 2. Returns: The number of possible deprotonations in the molecule. """ # generate smiles for this molecule and check it's not empty smiles = self.get_smiles() if smiles == "": raise ValueError("SMILES string for this glycan is empty, check if the IUPAC is convertable.") # generate molecule with RDKit and check it's not None mol = smiles2mol(smiles) if mol is None: raise ValueError("Generated SMILES is invalid, rdkit couldn't read it in.") # iterate over deprotonatable functional groups in every form count = 0 for core, group in [ (6, [("C(O)=O", 1)]), (15, [("P(=O)(O)O", 3), ("P(=O)O", 2), ("P=O", 1)]), (16, [("S(=O)(=O)O", 2), ("S(=O)(=O)", 1)]) ]: matched_atoms = set() for g, val in group: # compute the matches against this glycan matches = mol.GetSubstructMatches(smiles2mol(g)) for match in matches: for aid in match: # find the core atom of each match, add it to the list of covered atoms and increase the count if mol.GetAtomWithIdx(aid).GetAtomicNum() == core and aid not in matched_atoms: matched_atoms.add(aid) # do some trick to either count groups to be deprotonated or possibly chargeable atoms count += val ** (1 - grouping) return count
[docs] def count_functional_groups(self, groups): """ Count the number of the provided functional group in the final molecule. Args: groups (Union[str, List[str]]): each string has to be a valid functional group representable by this tool, a valid SMILES string or a valid SMARTS string Returns: The number of matches of all functional groups. This count might overlap in the matched atoms. """ # in case of string input, put it into a one-element list if not isinstance(groups, list): groups = [groups] # generate smiles for this molecule and check it's not empty smiles = self.get_smiles() if smiles == "": raise ValueError("SMILES string for this glycan is empty, check if the IUPAC is convertable.") # generate molecule with RDKit and check it's not None mol = smiles2mol(smiles) if mol is None: raise ValueError("Generated SMILES is invalid, rdkit couldn't read it in.") fgs = [] for group in groups: # convert the functional group into a RDKit molecule if group in functional_groups: tmp = smiles2mol(functional_groups[group]) else: tmp = smiles2mol(group, sanitize=False) # check the functional groups for validity if tmp is None: raise ValueError(f"The functional group {group} cannot be parsed into a molecule. " f"Please make sure, it's a valid SMILES string or an implemented functional group.") fgs.append(tmp) # sum over matches of functional groups count = 0 hydro_mol = Chem.rdmolops.AddHs(mol) for g in fgs: matched = set() if 1 in set(a.GetAtomicNum() for a in g.GetAtoms()): matches = hydro_mol.GetSubstructMatches(g) else: matches = mol.GetSubstructMatches(g) for match in matches: match = set(match) if len(matched.intersection(match)) == 0: count += 1 matched = matched.union(match) return count
[docs] def get_smiles(self): """ Request the SMILES string of the parsed molecule. Returns: Generated SMILES string """ # return an empty SMILES if the output is required to represent all modifications, but it actually wouldn't if not self.tree_only and self.tree_full != self.full: return "" if self.glycan_smiles is None: self.parse_tree, self.tree_full = TreeWalker(self.factory, False).parse(self.grammar_tree) self.glycan_smiles = Merger(self.factory).merge(self.parse_tree, self.root_orientation, start=self.start) # self.glycan_smiles = self.glycan_smiles.replace("At", "O-") return self.glycan_smiles
[docs] def get_tree(self): """ Request the tree parsed from the IUPAC in this instance. Returns: The parsed tree with the single monomers in the nodes. """ return self.parse_tree
[docs] def save_dot(self, output, horizontal=False): """ Save the tree structure of the encoded glycan molecule into a dot file visualizing the graph of monomers. Args: output (str): path to store the DOT file in horizontal (bool): Show graph in horizontal orientation from left to right Returns: pydot graph object containing the graph """ if horizontal: graph = pydot.Dot("iupac_tree", rankdir="LR") else: graph = pydot.Dot("iupac_tree") for node in range(len(self.parse_tree.nodes)): graph.add_node(pydot.Node(node, label=self.parse_tree.nodes[node]["type"].get_name("full"))) for edge in self.parse_tree.edges(): graph.add_edge(pydot.Edge(*edge[::-1], label=self.parse_tree.get_edge_data(*edge)["type"])) graph.write(output) return graph
[docs] def parse(self): """ Adapter on the Lexer and Parser generated by ANTLR based on glyles/grammar/Glycan.g4. """ try: # parse the remaining structure description following the grammar, also add the dummy characters if not isinstance(self.iupac, str): raise ParseError("Only string input can be parsed: " + str(self.iupac)) self.iupac = prepare('#' + self.iupac + '#') stream = InputStream(data=self.iupac) lexer = IUPACLexer(stream) token = CommonTokenStream(lexer) parser = IUPACParser(token) lexer.removeErrorListeners() lexer.addErrorListener(GlyLESErrorListener()) parser.removeErrorListeners() parser.addErrorListener(GlyLESErrorListener()) try: self.grammar_tree = parser.start() except ParseError as pe: self.parse_tree = None self.glycan_smiles = "" raise pe from None # walk through the AST and parse the AST into a networkx representation of the glycan. self.parse_tree, self.tree_full = TreeWalker(self.factory, self.tree_only).parse(self.grammar_tree) # if the glycan should be parsed immediately, do so if not self.tree_only and self.tree_full and self.full: self.glycan_smiles = Merger(self.factory).merge(self.parse_tree, self.root_orientation, start=self.start) # catch any exception at glycan level to not destroy the whole pipeline because of one mis-formed glycan except ParseError as e: msg = e.__str__().replace("\n", " ") logging.error(f"A parsing error occurred with \"{self.iupac}\": Error message: {msg}") self.glycan_smiles = "" except Exception as e: msg = e.__str__().replace("\n", " ") logging.error(f"An unexpected exception occurred with \"{self.iupac}\". This glycan cannot be parsed. " f"Error message: {msg}") self.glycan_smiles = "" raise e