Source code for rxn_insight.database

import pandas as pd
from tqdm import tqdm
from rxn_insight.reaction import Reaction
from rxn_insight.utils import curate_smirks, get_reaction_template, make_rdkit_fp
from rxnmapper import RXNMapper
from typing import Union, Tuple, List
from importlib import resources


class Database:
    """
    A class to manage and analyze reaction datasets, providing functionalities
    for creating databases, analyzing reactions, and saving results.

    Attributes are plain instance fields:

    * ``df`` -- the analyzed reaction table (empty until a database is created
      unless a DataFrame was passed to the constructor).
    * ``skipped_reactions`` -- reaction strings that could not be analyzed.
    * ``class_distribution`` / ``name_distribution`` -- summary tables computed
      when a database is created.

    Example:
        >>> from rxn_insight.database import Database
        >>> import pandas as pd
        >>> # Create a sample DataFrame
        >>> data = {
        ...     "reaction": ["OB(O)c1ccccc1.Brc1ccccc1>>c1ccc(-c2ccccc2)cc1"],
        ...     "solvent": ["CN(C)C=O"],
        ...     "reagent": ["F[Cs]"],
        ...     "catalyst": ["[Pd]"],
        ...     "yield": [85],
        ...     "reference": ["Ref1"]
        ... }
        >>> df = pd.DataFrame(data)
        >>> # Initialize a Database object
        >>> db = Database()
        >>> # Create a database from the DataFrame
        >>> reaction_df = db.create_database_from_df(
        ...     df,
        ...     reaction_column="reaction",
        ...     solvent_column="solvent",
        ...     reagent_column="reagent",
        ...     catalyst_column="catalyst",
        ...     yield_column="yield",
        ...     ref_column="reference"
        ... )
    """

    def __init__(self, df: Union[pd.DataFrame, None] = None):
        """
        Initializes a Database object with an optional DataFrame.

        Args:
            df: An optional pandas DataFrame containing reaction data. It is
                stored as-is (not copied, not analyzed).
        """
        self.df = pd.DataFrame({}) if df is None else df
        self.skipped_reactions = []
        self.class_distribution = pd.DataFrame({})
        self.name_distribution = pd.DataFrame({})

    def create_database_from_df(
        self,
        df: pd.DataFrame,
        reaction_column: str,
        solvent_column: str = "SOLVENT",
        reagent_column: str = "REAGENT",
        catalyst_column: str = "CATALYST",
        yield_column: str = "YIELD",
        ref_column: str = "REF"
    ) -> pd.DataFrame:
        """
        Creates a reaction database from a given DataFrame.

        The user-supplied column names are renamed to the canonical upper-case
        names, any canonical metadata column that is still missing is filled
        with ``"not-reported"``, and every reaction is analyzed. The result is
        stored on the instance together with the skipped reactions and the
        class/name distributions.

        Args:
            df: A DataFrame containing reaction data.
            reaction_column: Name of the column containing reaction SMILES.
            solvent_column: Name of the solvent column (default: "SOLVENT").
            reagent_column: Name of the reagent column (default: "REAGENT").
            catalyst_column: Name of the catalyst column (default: "CATALYST").
            yield_column: Name of the yield column (default: "YIELD").
            ref_column: Name of the reference column (default: "REF").

        Returns:
            A DataFrame with analyzed reaction data.
        """
        all_cols = ["SOLVENT", "REAGENT", "CATALYST", "YIELD", "REF"]
        # rename() returns a new frame, so the caller's DataFrame is untouched.
        df_protocol = df.rename(columns={
            reaction_column: "REACTION",
            solvent_column: "SOLVENT",
            reagent_column: "REAGENT",
            catalyst_column: "CATALYST",
            yield_column: "YIELD",
            ref_column: "REF",
        })
        for col in all_cols:
            if col not in df_protocol.columns:
                df_protocol[col] = "not-reported"

        df, skipped_reactions = analyze_reactions(df_protocol)
        self.df = df
        self.skipped_reactions = skipped_reactions
        self.class_distribution = calculate_class_distribution(df)
        self.name_distribution = calculate_name_distribution(df)
        return df

    def create_database_from_csv(
        self,
        fname: str,
        reaction_column: str,
        solvent_column: str = "SOLVENT",
        reagent_column: str = "REAGENT",
        catalyst_column: str = "CATALYST",
        yield_column: str = "YIELD",
        ref_column: str = "REF"
    ) -> pd.DataFrame:
        """
        Creates a reaction database from a CSV file.

        The file is read with ``pandas.read_csv`` and then processed exactly
        like :meth:`create_database_from_df`.

        Args:
            fname: Path to the CSV file.
            reaction_column: Name of the column containing reaction SMILES.
            solvent_column: Name of the solvent column (default: "SOLVENT").
            reagent_column: Name of the reagent column (default: "REAGENT").
            catalyst_column: Name of the catalyst column (default: "CATALYST").
            yield_column: Name of the yield column (default: "YIELD").
            ref_column: Name of the reference column (default: "REF").

        Returns:
            A DataFrame with analyzed reaction data.
        """
        df_csv = pd.read_csv(fname, index_col=None)
        # Delegate so the column handling lives in exactly one place.
        return self.create_database_from_df(
            df_csv,
            reaction_column=reaction_column,
            solvent_column=solvent_column,
            reagent_column=reagent_column,
            catalyst_column=catalyst_column,
            yield_column=yield_column,
            ref_column=ref_column,
        )

    def save_to_parquet(self, fname: str) -> None:
        """
        Saves the reaction database to a Parquet file.

        Args:
            fname: The name of the output file (without extension). The file
                is written to ``<fname>.gzip``.
        """
        self.df.to_parquet(f"{fname}.gzip")

    def save_to_csv(self, fname: str) -> None:
        """
        Saves the reaction database to a CSV file.

        Args:
            fname: The name of the output file (without extension). The file
                is written to ``<fname>.csv``.
        """
        self.df.to_csv(f"{fname}.csv")

    def save_to_excel(self, fname: str) -> None:
        """
        Saves the reaction database to an Excel file.

        Args:
            fname: The name of the output file (without extension). The file
                is written to ``<fname>.xlsx``.
        """
        self.df.to_excel(f"{fname}.xlsx")

    def get_class_distribution(self) -> pd.DataFrame:
        """
        Retrieves the class distribution of reactions in the database.

        Returns:
            A DataFrame summarizing the reaction class distribution (empty
            until a database has been created).
        """
        return self.class_distribution

    def get_name_distribution(self) -> pd.DataFrame:
        """
        Retrieves the distribution of reaction names in the database.

        Returns:
            A DataFrame summarizing the reaction name distribution (empty
            until a database has been created).
        """
        return self.name_distribution
def analyze_reactions(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """
    Analyzes a DataFrame of reactions to extract detailed information.

    For every row a ``Reaction`` is built from the ``REACTION``, ``SOLVENT``,
    ``REAGENT``, ``CATALYST`` and ``REF`` columns, and the result columns
    listed in ``headers`` are filled in. Rows whose analysis raises an
    exception are reported on stdout, recorded, and removed from the result.

    Note:
        The result columns are added to ``df`` in place before the analysis
        starts, so the frame passed in is modified.

    Args:
        df: A DataFrame with reaction data. Must contain the columns
            ``REACTION``, ``SOLVENT``, ``REAGENT``, ``CATALYST`` and ``REF``.

    Returns:
        A tuple containing the updated DataFrame and a list of skipped reactions.
    """
    headers = [
        'REACTANTS', 'PRODUCTS', 'SANITIZED_REACTION', 'MAPPED_REACTION',
        'N_REACTANTS', 'N_PRODUCTS', 'FG_REACTANTS', 'FG_PRODUCTS',
        'PARTICIPATING_RINGS_REACTANTS', 'PARTICIPATING_RINGS_PRODUCTS',
        'ALL_RINGS_PRODUCTS', 'BY-PRODUCTS', 'CLASS', 'NAME', 'TAG', 'TAG2',
        'SCAFFOLD', 'rxn_str_patt_fp', 'rxn_dif_patt_fp', 'rxn_str_morgan_fp',
        'rxn_dif_morgan_fp', 'TEMPLATE'
    ]
    df[headers] = ""

    # Shared, expensive-to-build resources are created once for all reactions.
    rxn_mapper = RXNMapper()
    with resources.path(
        f"{__package__}.json", "functional_groups.json"
    ) as path:
        fg_db = pd.read_json(path, orient="records", lines=True)
    with resources.path(
        f"{__package__}.json", "smirks.json"
    ) as path:
        smirks = pd.read_json(path, orient="records", lines=True)
    smirks_db = curate_smirks(smirks)

    bad_reactions = []
    failed_indices = []
    for i in tqdm(df.index, desc="Analyzing all reactions..."):
        reaction_smiles = df.loc[i, "REACTION"]
        try:
            rxn = Reaction(
                reaction_smiles,
                solvent=df.loc[i, "SOLVENT"],
                reagent=df.loc[i, "REAGENT"],
                catalyst=df.loc[i, "CATALYST"],
                ref=df.loc[i, "REF"],
                rxn_mapper=rxn_mapper,
                smirks=smirks_db,
                fg=fg_db,
            )
            ri = rxn.get_reaction_info()
            for header in headers:
                if header in ri:
                    value = ri[header]
                    # List-valued entries are stored as dot-separated strings.
                    if isinstance(value, list):
                        df.loc[i, header] = ".".join(value)
                    else:
                        df.loc[i, header] = value
            sanitized = ri["REACTION"]
            df.loc[i, "SANITIZED_REACTION"] = sanitized
            df.loc[i, "TAG2"] = rxn.give_broad_tag()
            df.loc[i, "TEMPLATE"] = get_reaction_template(ri["MAPPED_REACTION"], 2, 2)
            df.loc[i, "REACTANTS"], df.loc[i, "PRODUCTS"] = sanitized.split(">>")
            df.loc[i, "rxn_str_patt_fp"] = make_rdkit_fp(sanitized, fp="MACCS", concatenate=True)
            df.loc[i, "rxn_dif_patt_fp"] = make_rdkit_fp(sanitized, fp="MACCS", concatenate=False)
            df.loc[i, "rxn_str_morgan_fp"] = make_rdkit_fp(sanitized, fp="Morgan", concatenate=True)
            df.loc[i, "rxn_dif_morgan_fp"] = make_rdkit_fp(sanitized, fp="Morgan", concatenate=False)
        except Exception as e:
            # Best-effort processing: one bad reaction must not abort the run.
            # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still
            # propagates.
            print(f"Error at reaction nr {i}, {reaction_smiles}", e)
            bad_reactions.append(reaction_smiles)
            failed_indices.append(i)

    # Drop all failed rows in one operation instead of copying the frame on
    # every failure.
    if failed_indices:
        df = df.drop(index=failed_indices)
    return df, bad_reactions
def calculate_class_distribution(df: pd.DataFrame) -> pd.DataFrame:
    """
    Counts how many reactions fall into each known reaction class.

    Every class in the fixed list below gets a row, including classes that do
    not occur in ``df`` (count 0). Values of ``CLASS`` that are not in the
    list are ignored.

    Args:
        df: A DataFrame containing reaction data with a ``CLASS`` column.

    Returns:
        A DataFrame with the columns ``CLASS`` and ``COUNT``, one row per
        known class, in the order of the list below.
    """
    known_classes = [
        'Acylation',
        'Heteroatom Alkylation and Arylation',
        'Aromatic Heterocycle Formation',
        'C-C Coupling',
        'Deprotection',
        'Protection',
        'Functional Group Interconversion',
        'Functional Group Addition',
        'Reduction',
        'Oxidation',
        'Miscellaneous',
    ]
    observed = df["CLASS"].to_list()
    occurrences = [observed.count(label) for label in known_classes]
    return pd.DataFrame({"CLASS": known_classes, "COUNT": occurrences})
def calculate_name_distribution(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the distribution of reaction names.

    Each distinct value of the ``NAME`` column gets one row. Rows appear in
    the order in which the names are first encountered in ``df``, so the
    output is reproducible between runs.

    Args:
        df: A DataFrame containing reaction data with a ``NAME`` column.

    Returns:
        A DataFrame with the columns ``NAME`` and ``COUNT``.
    """
    # Local import keeps this block self-contained; Counter tallies in a
    # single pass and preserves first-seen order.
    from collections import Counter

    name_counts = Counter(df["NAME"].to_list())
    return pd.DataFrame({
        "NAME": list(name_counts.keys()),
        "COUNT": list(name_counts.values()),
    })