Source code for pmotools.pmo_builder.panel_information_to_pmo

#!/usr/bin/env python3
import copy
import json

import numpy as np
import pandas as pd
import warnings

from .json_convert_utils import remove_optional_null_values
from ..pmo_builder.json_convert_utils import check_additional_columns_exist


[docs]class PMOPanelBuilder: """ Build PMO ``target_info`` and ``panel_info`` structures from a target table. Wraps a dataframe of one-row-per-target panel data and converts it into the nested dictionaries a PMO expects. Most users should call :func:`panel_info_table_to_pmo` instead of using this class directly. :param target_table: dataframe with one row per target :param panel_name: name assigned to the panel :param target_name_col: column holding the target names. Default: ``target_name`` :param forward_primers_seq_col: column holding the forward primer sequence. Default: ``fwd_primer`` :param reverse_primers_seq_col: column holding the reverse primer sequence. Default: ``rev_primer`` :param reaction_name_col: optional column naming which reaction each target belongs to; if omitted, all targets go in a single reaction :param reaction_name_col_delimiter: delimiter splitting the reaction column into multiple reactions. Default: ``,`` :param forward_primers_start_col: optional column with the 0-based forward primer start :param forward_primers_end_col: optional column with the 0-based forward primer end :param reverse_primers_start_col: optional column with the 0-based reverse primer start :param reverse_primers_end_col: optional column with the 0-based reverse primer end :param insert_start_col: optional column with the 0-based insert start :param insert_end_col: optional column with the 0-based insert end :param chrom_col: optional chromosome column; required if any location columns are set :param strand_col: optional strand column :param ref_seq_col: optional reference-sequence column for the insert :param gene_name_col: optional gene-name column :param target_attributes_col: optional column of target attribute classifications :param target_attributes_col_delimiter: delimiter splitting the attributes column into multiple attributes. 
Default: ``,`` :param additional_target_info_cols: optional list of extra column names to copy verbatim into each target dict """ def __init__( self, target_table: pd.DataFrame, panel_name: str, target_name_col: str = "target_name", forward_primers_seq_col: str = "fwd_primer", reverse_primers_seq_col: str = "rev_primer", reaction_name_col: str | None = None, reaction_name_col_delimiter: str = ",", forward_primers_start_col: int | None = None, forward_primers_end_col: int | None = None, reverse_primers_start_col: int | None = None, reverse_primers_end_col: int | None = None, insert_start_col: int | None = None, insert_end_col: int | None = None, chrom_col: str | None = None, strand_col: str | None = None, ref_seq_col: str | None = None, gene_name_col: str | None = None, target_attributes_col: str | None = None, target_attributes_col_delimiter: str = ",", additional_target_info_cols: list | None = None, ): self.target_table = target_table self.panel_name = panel_name self.target_name_col = target_name_col self.forward_primers_seq_col = forward_primers_seq_col self.reverse_primers_seq_col = reverse_primers_seq_col self.reaction_name_col = reaction_name_col self.reaction_name_col_delimiter = reaction_name_col_delimiter self.forward_primers_start_col = forward_primers_start_col self.forward_primers_end_col = forward_primers_end_col self.reverse_primers_start_col = reverse_primers_start_col self.reverse_primers_end_col = reverse_primers_end_col self.insert_start_col = insert_start_col self.insert_end_col = insert_end_col self.chrom_col = chrom_col self.strand_col = strand_col self.ref_seq_col = ref_seq_col self.gene_name_col = gene_name_col self.target_attributes_col = target_attributes_col self.target_attributes_col_delimiter = target_attributes_col_delimiter self.additional_target_info_cols = additional_target_info_cols self.location_info_cols = self.check_location_columns()
[docs] def check_location_columns(self): """ Validate the optional genomic-location column configuration. If any location column is set, enforces that ``chrom_col`` is present and that primer/insert start and end columns are supplied as pairs. :raises ValueError: if location columns are set inconsistently :return: the list of location columns if any were provided, otherwise None """ location_cols = [ self.forward_primers_start_col, self.forward_primers_end_col, self.reverse_primers_start_col, self.reverse_primers_end_col, self.insert_start_col, self.insert_end_col, self.chrom_col, self.strand_col, self.ref_seq_col, ] if any(location_cols): collect_warnings = [] if not self.chrom_col: collect_warnings.append( "If including location information (any of forward_primers_start_col, forward_primers_end_col, reverse_primers_start_col, reverse_primers_end_col, insert_start_col, insert_end_col) chrom_col must be set." ) if (self.forward_primers_start_col is None) != ( self.forward_primers_end_col is None ): collect_warnings.append( "If one of forward_primers_start_col or forward_primers_end_col is set, then both must be." ) if (self.reverse_primers_start_col is None) != ( self.reverse_primers_end_col is None ): collect_warnings.append( "If one of reverse_primers_start_col or reverse_primers_end_col is set, then both must be." ) if (self.insert_start_col is None) != (self.insert_end_col is None): collect_warnings.append( "If one of insert_start_col or insert_end_col is set, then both must be." ) if collect_warnings: raise ValueError( "Errors with location column configuration:\n- " + "\n- ".join(collect_warnings) ) return location_cols return None
[docs] def check_target_names_are_unique(self): """ Raise an exception if the target names are not unique :return: Nothing """ duplications = self.target_table[ self.target_table[self.target_name_col].duplicated(keep=False) ] if not duplications.empty: raise ValueError( f"The following target_ids are duplicated: {duplications[self.target_name_col].unique()}" )
[docs] def check_unique_target_info(self, columns_to_check): """ Raise an exception if the target info is not unique :param columns_to_check: the columns to check to ensure the target info is unique :return: Nothing """ groups = ( self.target_table.groupby(columns_to_check)[self.target_name_col] .apply(list) .reset_index(name=self.target_name_col) ) # Keep only groups where more than one target shares the same primer pair duplicated_groups = groups[groups[self.target_name_col].str.len() > 1] if not duplicated_groups.empty: msg_lines = ["The following targets have duplicated information:"] for _, row in duplicated_groups.iterrows(): cols_info = ", ".join(f"{col}={row[col]}" for col in columns_to_check) targets = ", ".join(map(str, row[self.target_name_col])) msg_lines.append(f"targets: {targets}{cols_info}") raise ValueError("\n".join(msg_lines))
[docs] def summarise_targets_missing_optional_info(self): """ Warn about targets missing optional location fields. For each of insert, forward-primer, and reverse-primer locations that was requested, finds targets with empty coordinate fields and emits a warning. Targets listed here are skipped when their location block is built. :return: a tuple ``(missing_insert_loc, missing_fwd_primer_loc, missing_rev_primer_loc)``; each element is a list of target names, or None if that location type was not requested """ missing_insert_loc = None missing_fwd_primer_loc = None missing_rev_primer_loc = None def check_missing(name, cols): missing = self.target_table[self.target_table[cols].isnull().any(axis=1)][ self.target_name_col ].tolist() if len(missing) > 0: warnings.warn( f"{name} location information was not added for the following targets that had empty fields: {', '.join(missing)}" ) return missing missing_insert_loc = ( check_missing( "Insert", [self.chrom_col, self.insert_start_col, self.insert_end_col] ) if self.insert_start_col else None ) missing_fwd_primer_loc = ( check_missing( "Forward primer", [ self.chrom_col, self.forward_primers_start_col, self.forward_primers_end_col, ], ) if self.forward_primers_start_col else None ) missing_rev_primer_loc = ( check_missing( "Reverse primer", [ self.chrom_col, self.reverse_primers_start_col, self.reverse_primers_end_col, ], ) if self.reverse_primers_start_col else None ) return missing_insert_loc, missing_fwd_primer_loc, missing_rev_primer_loc
[docs] def build_target_info_dict( self, genome_id_col: str | None = None, ): """ Build the list of target_info dictionaries from the target table. Validates target-name uniqueness and primer/location uniqueness, then assembles one dict per target including primer sequences and, where available, insert and primer genomic locations. :param genome_id_col: optional column holding the genome id for each target; if omitted, a genome_id of 0 is used :return: a list of target_info dictionaries """ # Check targets before putting into JSON ( forward_primers_start_col, forward_primers_end_col, reverse_primers_start_col, reverse_primers_end_col, insert_start_col, insert_end_col, chrom_col, strand_col, ref_seq_col, ) = self.location_info_cols if self.location_info_cols else [None] * 9 # Check target information in the dataframe self.check_target_names_are_unique() columns_to_check = [self.forward_primers_seq_col, self.reverse_primers_seq_col] if self.location_info_cols: columns_to_check += [col for col in self.location_info_cols if col] self.check_unique_target_info(list(set(columns_to_check))) ( missing_insert_loc, missing_fwd_primer_loc, missing_rev_primer_loc, ) = self.summarise_targets_missing_optional_info() # Put targets together in a dictionary targets_dicts = [] for _, row in self.target_table.iterrows(): target_name = row[self.target_name_col] target_dict = { "target_name": target_name, } if self.gene_name_col: target_dict["gene_name"] = row[self.gene_name_col] if self.target_attributes_col: target_dict["target_attributes"] = row[ self.target_attributes_col ].split(self.target_attributes_col_delimiter) if self.additional_target_info_cols: for col in self.additional_target_info_cols: value = row[col] # Convert numpy types to native Python types if isinstance(value, (np.integer, np.int64)): value = int(value) elif isinstance(value, (np.floating, np.float64)): value = float(value) elif pd.isna(value): value = None target_dict[col] = value # Add insert location info if 
location_info_cols are provided if insert_start_col and target_name not in missing_insert_loc: if genome_id_col: genome_id = int(row[genome_id_col]) else: genome_id = 0 target_dict["insert_location"] = { "genome_id": genome_id, "chrom": row[chrom_col], "start": int(row[insert_start_col]), "end": int(row[insert_end_col]), } if strand_col and pd.notna(row[strand_col]): target_dict["insert_location"]["strand"] = row[strand_col] if ref_seq_col and pd.notna(row[ref_seq_col]): target_dict["insert_location"]["ref_seq"] = row[ref_seq_col] # Extract primer information for each row fwd_primer_dict = {"seq": row[self.forward_primers_seq_col]} rev_primer_dict = {"seq": row[self.reverse_primers_seq_col]} if forward_primers_start_col and target_name not in missing_fwd_primer_loc: if genome_id_col: genome_id = int(row[genome_id_col]) else: genome_id = 0 fwd_primer_dict["location"] = { "genome_id": genome_id, "chrom": row[chrom_col], "end": int(row[forward_primers_start_col]), "start": int(row[forward_primers_end_col]), } if strand_col and pd.notna(row[strand_col]): fwd_primer_dict["location"]["strand"] = row[strand_col] if reverse_primers_start_col and target_name not in missing_rev_primer_loc: if genome_id_col: genome_id = int(row[genome_id_col]) else: genome_id = 0 rev_primer_dict["location"] = { "genome_id": genome_id, "chrom": row[chrom_col], "end": int(row[reverse_primers_end_col]), "start": int(row[reverse_primers_start_col]), } if strand_col and pd.notna(row[strand_col]): rev_primer_dict["location"]["strand"] = row[strand_col] target_dict["forward_primer"] = fwd_primer_dict target_dict["reverse_primer"] = rev_primer_dict targets_dicts.append(target_dict) return targets_dicts
[docs] def build_panel_info_dict(self, targets_dict): """ Build the panel_info dictionary, grouping targets into reactions. If no reaction column was configured, all targets are placed in a single reaction named ``full``. :param targets_dict: the target_info list from :meth:`build_target_info_dict` :return: a panel_info dictionary with ``panel_name`` and ``reactions``, where each reaction lists target indices into ``targets_dict`` """ panel_dict = {"panel_name": self.panel_name, "reactions": []} target_indices = dict() for i, target_dict in enumerate(targets_dict): target_indices[target_dict["target_name"]] = i if self.reaction_name_col: reactions = ( self.target_table[self.reaction_name_col] .str.split(self.reaction_name_col_delimiter) .explode() .str.strip() # Remove leading/trailing whitespace .unique() ) else: reactions = ["full"] self.target_table["reaction"] = "full" self.reaction_name_col = "reaction" for reaction in reactions: # Filter rows where the reaction column contains this reaction matching_rows = self.target_table[ self.target_table[self.reaction_name_col] .str.split(self.reaction_name_col_delimiter) .apply( lambda x: reaction in [item.strip() for item in x] if isinstance(x, list) else False ) ] # Get the indices for these targets target_indices_for_reaction = [ target_indices[target_name] for target_name in matching_rows[self.target_name_col] ] panel_dict["reactions"].append( { "reaction_name": reaction, "panel_targets": target_indices_for_reaction, } ) return panel_dict
def check_genome_info(genome_info):
    """
    Validate that genome info contains the required keys.

    Accepts either a single genome dict or a list of them, and checks each
    for the keys ``name``, ``genome_version``, ``taxon_id``, and ``url``.

    :param genome_info: a genome dict or list of genome dicts
    :raises TypeError: if genome_info is not a dict or list, or a list element is not a dict
    :raises ValueError: if the list is empty or any entry is missing required keys
    :return: Nothing
    """
    required_keys = {"name", "genome_version", "taxon_id", "url"}

    # Normalise both accepted shapes into (label, entry) pairs so a single
    # validation loop serves the dict and the list case.
    if isinstance(genome_info, dict):
        labelled_entries = [("genome_info", genome_info)]
    elif isinstance(genome_info, list):
        if not genome_info:
            raise ValueError("genome_info list cannot be empty")
        labelled_entries = [
            (f"genome_info[{i}]", entry) for i, entry in enumerate(genome_info)
        ]
    else:
        raise TypeError(
            f"genome_info must be a dict or list, but got {type(genome_info).__name__}"
        )

    for label, genome_dict in labelled_entries:
        if not isinstance(genome_dict, dict):
            raise TypeError(
                f"{label} must be a dict, but got {type(genome_dict).__name__}"
            )
        # Sorted so the error text does not depend on set iteration order
        missing_keys = sorted(required_keys - genome_dict.keys())
        if missing_keys:
            raise ValueError(
                f"{label} missing required keys: {', '.join(missing_keys)}"
            )
def merge_panel_info_dicts(panel_info_dicts: list[dict]) -> dict:
    """
    Merge multiple panel_info dictionaries produced by panel_info_table_to_pmo.

    Target lists are concatenated (deduplicated by target_name, first
    occurrence wins) and all genome references are collapsed so that genome
    identifiers remain valid across the merged structure. The input
    dictionaries are not modified; merged targets are deep copies.

    :param panel_info_dicts: a list of panel_info dicts, each with
        ``target_info`` and ``panel_info`` (and optionally ``targeted_genomes``)
    :raises ValueError: if the list is empty, a dict lacks ``target_info``, a
        target lacks ``target_name``, or a target has location data without
        accompanying ``targeted_genomes``
    :return: a merged dict with ``panel_info`` and ``target_info`` keys, plus
        ``targeted_genomes`` if any genomes were present
    """
    if not panel_info_dicts:
        raise ValueError("panel_info_dicts must contain at least one entry.")

    primer_keys = (
        ("forward_primer", "forward primer"),
        ("reverse_primer", "reverse primer"),
    )

    merged_targets: list[dict] = []
    target_name_to_index: dict[str, int] = {}
    merged_panels: list[dict] = []
    merged_genomes: list[dict] = []
    genome_signature_to_index: dict[str, int] = {}

    def canonicalise_genome(genome: dict) -> str:
        # Key-order-independent signature, so identical genomes are stored once
        return json.dumps(genome, sort_keys=True)

    def iter_locations(target_entry: dict):
        # Yield every location dict on a target: the insert, then each primer
        insert_loc = target_entry.get("insert_location")
        if insert_loc:
            yield insert_loc
        for primer_key, _ in primer_keys:
            primer = target_entry.get(primer_key)
            if primer and isinstance(primer, dict):
                primer_loc = primer.get("location")
                if primer_loc:
                    yield primer_loc

    def remap_genome_ids(target_entry: dict, mapping: dict[int, int]) -> None:
        # Rewrite per-input genome ids to indices into the merged genome list;
        # ids absent from the mapping are left as they are.
        for location in iter_locations(target_entry):
            if "genome_id" in location and location["genome_id"] in mapping:
                location["genome_id"] = mapping[location["genome_id"]]

    def ensure_no_locations(target_entry: dict, target_name) -> None:
        # A location refers to a genome by index, so without a genome list it
        # cannot be resolved.
        if "insert_location" in target_entry:
            raise ValueError(
                f"target {target_name} has insert_location but no targeted_genomes information is included"
            )
        for primer_key, label in primer_keys:
            primer = target_entry.get(primer_key)
            if isinstance(primer, dict) and "location" in primer:
                raise ValueError(
                    f"target {target_name} has {label} location but no targeted_genomes information is included"
                )

    for panel_dict in panel_info_dicts:
        has_genomes = "targeted_genomes" in panel_dict
        genome_mapping: dict[int, int] = {}
        if has_genomes:
            for idx, genome in enumerate(panel_dict["targeted_genomes"]):
                signature = canonicalise_genome(genome)
                if signature not in genome_signature_to_index:
                    genome_signature_to_index[signature] = len(merged_genomes)
                    merged_genomes.append(genome)
                genome_mapping[idx] = genome_signature_to_index[signature]

        if "target_info" not in panel_dict:
            raise ValueError("panel_info_dict missing 'target_info'.")

        for target in panel_dict["target_info"]:
            target_name = target.get("target_name")
            if target_name is None:
                raise ValueError("Each target_info entry must include a 'target_name'.")
            if target_name in target_name_to_index:
                # Already merged from an earlier input; the first one is kept
                continue
            target_copy = copy.deepcopy(target)
            if has_genomes:
                remap_genome_ids(target_copy, genome_mapping)
            else:
                ensure_no_locations(target_copy, target_name)
            target_name_to_index[target_name] = len(merged_targets)
            merged_targets.append(target_copy)

        for panel in panel_dict.get("panel_info", []):
            remapped_panel = {"panel_name": panel["panel_name"], "reactions": []}
            for reaction in panel.get("reactions", []):
                # Translate this input's target indices to merged indices via
                # the target name.
                remapped_targets = [
                    target_name_to_index[
                        panel_dict["target_info"][target_idx]["target_name"]
                    ]
                    for target_idx in reaction["panel_targets"]
                ]
                remapped_panel["reactions"].append(
                    {
                        "reaction_name": reaction["reaction_name"],
                        "panel_targets": remapped_targets,
                    }
                )
            merged_panels.append(remapped_panel)

    ret = {"panel_info": merged_panels, "target_info": merged_targets}
    if merged_genomes:
        ret["targeted_genomes"] = merged_genomes
    return ret
def panel_info_table_to_pmo(
    target_table: pd.DataFrame,
    panel_name: str,
    genome_info: dict | list | None = None,
    target_name_col: str = "target_name",
    forward_primers_seq_col: str = "fwd_primer",
    reverse_primers_seq_col: str = "rev_primer",
    reaction_name_col: str | None = None,
    reaction_name_col_delimiter: str = ",",
    forward_primers_start_col: str | None = None,
    forward_primers_end_col: str | None = None,
    reverse_primers_start_col: str | None = None,
    reverse_primers_end_col: str | None = None,
    insert_start_col: str | None = None,
    insert_end_col: str | None = None,
    chrom_col: str | None = None,
    strand_col: str | None = None,
    ref_seq_col: str | None = None,
    gene_name_col: str | None = None,
    genome_id_col: str | None = None,
    target_attributes_col: str | None = None,
    target_attributes_col_delimiter: str = ",",
    additional_target_info_cols: list | None = None,
):
    """
    Convert a dataframe containing panel information into dictionary of targets and reference information

    :param target_table: the dataframe containing the target information
    :type target_table: pd.DataFrame
    :param panel_name: the panel ID assigned to the panel
    :type panel_name: str
    :param genome_info: reference genome information, needed if the target info contains genome location
    :type genome_info: dict or list, optional
    :param target_name_col: the name of the column containing the target IDs. Default: target_name
    :type target_name_col: str
    :param forward_primers_seq_col: the name of the column containing the sequence of the forward primer. Default: fwd_primer
    :type forward_primers_seq_col: str
    :param reverse_primers_seq_col: the name of the column containing the sequence of the reverse primer. Default: rev_primer
    :type reverse_primers_seq_col: str
    :param reaction_name_col: the name of the column containing which reaction the target was part of. By default they will all be put in one reaction.
    :type reaction_name_col: str, optional
    :param reaction_name_col_delimiter: the delimiter used to split the reaction name column into multiple reactions. Default is a comma.
    :type reaction_name_col_delimiter: str
    :param forward_primers_start_col: the name of the column containing the 0-based start coordinate of the forward primer
    :type forward_primers_start_col: str, optional
    :param forward_primers_end_col: the name of the column containing the 0-based end coordinate of the forward primer
    :type forward_primers_end_col: str, optional
    :param reverse_primers_start_col: the name of the column containing the 0-based start coordinate of the reverse primer
    :type reverse_primers_start_col: str, optional
    :param reverse_primers_end_col: the name of the column containing the 0-based end coordinate of the reverse primer
    :type reverse_primers_end_col: str, optional
    :param insert_start_col: the name of the column containing the 0-based start coordinate of the insert
    :type insert_start_col: str, optional
    :param insert_end_col: the name of the column containing the 0-based end coordinate of the insert
    :type insert_end_col: str, optional
    :param chrom_col: the name of the column containing the chromosome for the target
    :type chrom_col: str, optional
    :param gene_name_col: the name of the column containing the gene id
    :type gene_name_col: str, optional
    :param strand_col: the name of the column containing the strand for the target
    :type strand_col: str, optional
    :param ref_seq_col: the name of the column containing the reference sequence for the insert
    :type ref_seq_col: str, optional
    :param target_attributes_col: the name of the column containing the classification types for the primer target
    :type target_attributes_col: str, optional
    :param target_attributes_col_delimiter: the delimiter used to split the target attributes column into multiple attributes. Default is a comma.
    :type target_attributes_col_delimiter: str
    :param genome_id_col: the name of the column containing the genome ID (default is 0)
    :type genome_id_col: str, optional
    :param additional_target_info_cols: a list of additional column names to copy verbatim into each target information dictionary
    :type additional_target_info_cols: list, optional
    :raises ValueError: if target_table is not a DataFrame, or if any target carries location information while no genome_info was given
    :return: a dict of the panel information
    :rtype: dict
    """
    if not isinstance(target_table, pd.DataFrame):
        raise ValueError("target_table must be a pandas DataFrame.")

    # Check additional columns if any are added
    check_additional_columns_exist(target_table, additional_target_info_cols)

    builder = PMOPanelBuilder(
        target_table=target_table,
        panel_name=panel_name,
        target_name_col=target_name_col,
        forward_primers_seq_col=forward_primers_seq_col,
        reverse_primers_seq_col=reverse_primers_seq_col,
        reaction_name_col=reaction_name_col,
        reaction_name_col_delimiter=reaction_name_col_delimiter,
        forward_primers_start_col=forward_primers_start_col,
        forward_primers_end_col=forward_primers_end_col,
        reverse_primers_start_col=reverse_primers_start_col,
        reverse_primers_end_col=reverse_primers_end_col,
        insert_start_col=insert_start_col,
        insert_end_col=insert_end_col,
        chrom_col=chrom_col,
        strand_col=strand_col,
        ref_seq_col=ref_seq_col,
        gene_name_col=gene_name_col,
        target_attributes_col=target_attributes_col,
        target_attributes_col_delimiter=target_attributes_col_delimiter,
        additional_target_info_cols=additional_target_info_cols,
    )

    # Normalise genome_info to a list; an empty dict/list counts as "not given"
    if genome_info and isinstance(genome_info, dict):
        genome_info = [genome_info]
    if genome_info:
        check_genome_info(genome_info)

    # Create a dictionary of targets and panels
    targets_dict = builder.build_target_info_dict(genome_id_col)

    if not genome_info:
        # Locations refer to a genome by id, so they cannot be emitted unless
        # genome_info (stored as targeted_genomes) is supplied.
        for target in targets_dict:
            if "insert_location" in target:
                raise ValueError(
                    f"insert_location is provided for {target['target_name']} but genome_info (targeted_genomes) is not provided."
                )
            for primer_key, label in (
                ("forward_primer", "forward primer"),
                ("reverse_primer", "reverse primer"),
            ):
                if "location" in target[primer_key]:
                    raise ValueError(
                        f"{label} location is provided for {target['target_name']} but genome_info (targeted_genomes) is not provided."
                    )

    panel_dict = builder.build_panel_info_dict(targets_dict)

    # Put together components
    panel_info_dict = {
        "panel_info": [panel_dict],
        "target_info": targets_dict,
    }
    remove_optional_null_values(
        panel_info_dict["target_info"],
        ["gene_name", "insert_location", "markers_of_interest", "target_attributes"],
    )
    if genome_info:
        panel_info_dict["targeted_genomes"] = genome_info
        # NOTE(review): "chromosomes" and "gff_url" look like genome fields,
        # yet this call targets target_info; it may have been meant for
        # panel_info_dict["targeted_genomes"]. Confirm against
        # remove_optional_null_values before changing.
        remove_optional_null_values(
            panel_info_dict["target_info"], ["chromosomes", "gff_url"]
        )
    return panel_info_dict