#!/usr/bin/env python3
import copy
import json
import numpy as np
import pandas as pd
import warnings
from .json_convert_utils import remove_optional_null_values
from ..pmo_builder.json_convert_utils import check_additional_columns_exist
[docs]class PMOPanelBuilder:
"""
Build PMO ``target_info`` and ``panel_info`` structures from a target table.
Wraps a dataframe of one-row-per-target panel data and converts it into the
nested dictionaries a PMO expects. Most users should call
:func:`panel_info_table_to_pmo` instead of using this class directly.
:param target_table: dataframe with one row per target
:param panel_name: name assigned to the panel
:param target_name_col: column holding the target names. Default: ``target_name``
:param forward_primers_seq_col: column holding the forward primer sequence. Default: ``fwd_primer``
:param reverse_primers_seq_col: column holding the reverse primer sequence. Default: ``rev_primer``
:param reaction_name_col: optional column naming which reaction each target
belongs to; if omitted, all targets go in a single reaction
:param reaction_name_col_delimiter: delimiter splitting the reaction column
into multiple reactions. Default: ``,``
:param forward_primers_start_col: optional column with the 0-based forward primer start
:param forward_primers_end_col: optional column with the 0-based forward primer end
:param reverse_primers_start_col: optional column with the 0-based reverse primer start
:param reverse_primers_end_col: optional column with the 0-based reverse primer end
:param insert_start_col: optional column with the 0-based insert start
:param insert_end_col: optional column with the 0-based insert end
:param chrom_col: optional chromosome column; required if any location columns are set
:param strand_col: optional strand column
:param ref_seq_col: optional reference-sequence column for the insert
:param gene_name_col: optional gene-name column
:param target_attributes_col: optional column of target attribute classifications
:param target_attributes_col_delimiter: delimiter splitting the attributes
column into multiple attributes. Default: ``,``
:param additional_target_info_cols: optional list of extra column names to
copy verbatim into each target dict
"""
def __init__(
self,
target_table: pd.DataFrame,
panel_name: str,
target_name_col: str = "target_name",
forward_primers_seq_col: str = "fwd_primer",
reverse_primers_seq_col: str = "rev_primer",
reaction_name_col: str | None = None,
reaction_name_col_delimiter: str = ",",
forward_primers_start_col: int | None = None,
forward_primers_end_col: int | None = None,
reverse_primers_start_col: int | None = None,
reverse_primers_end_col: int | None = None,
insert_start_col: int | None = None,
insert_end_col: int | None = None,
chrom_col: str | None = None,
strand_col: str | None = None,
ref_seq_col: str | None = None,
gene_name_col: str | None = None,
target_attributes_col: str | None = None,
target_attributes_col_delimiter: str = ",",
additional_target_info_cols: list | None = None,
):
self.target_table = target_table
self.panel_name = panel_name
self.target_name_col = target_name_col
self.forward_primers_seq_col = forward_primers_seq_col
self.reverse_primers_seq_col = reverse_primers_seq_col
self.reaction_name_col = reaction_name_col
self.reaction_name_col_delimiter = reaction_name_col_delimiter
self.forward_primers_start_col = forward_primers_start_col
self.forward_primers_end_col = forward_primers_end_col
self.reverse_primers_start_col = reverse_primers_start_col
self.reverse_primers_end_col = reverse_primers_end_col
self.insert_start_col = insert_start_col
self.insert_end_col = insert_end_col
self.chrom_col = chrom_col
self.strand_col = strand_col
self.ref_seq_col = ref_seq_col
self.gene_name_col = gene_name_col
self.target_attributes_col = target_attributes_col
self.target_attributes_col_delimiter = target_attributes_col_delimiter
self.additional_target_info_cols = additional_target_info_cols
self.location_info_cols = self.check_location_columns()
[docs] def check_location_columns(self):
"""
Validate the optional genomic-location column configuration.
If any location column is set, enforces that ``chrom_col`` is present and
that primer/insert start and end columns are supplied as pairs.
:raises ValueError: if location columns are set inconsistently
:return: the list of location columns if any were provided, otherwise None
"""
location_cols = [
self.forward_primers_start_col,
self.forward_primers_end_col,
self.reverse_primers_start_col,
self.reverse_primers_end_col,
self.insert_start_col,
self.insert_end_col,
self.chrom_col,
self.strand_col,
self.ref_seq_col,
]
if any(location_cols):
collect_warnings = []
if not self.chrom_col:
collect_warnings.append(
"If including location information (any of forward_primers_start_col, forward_primers_end_col, reverse_primers_start_col, reverse_primers_end_col, insert_start_col, insert_end_col) chrom_col must be set."
)
if (self.forward_primers_start_col is None) != (
self.forward_primers_end_col is None
):
collect_warnings.append(
"If one of forward_primers_start_col or forward_primers_end_col is set, then both must be."
)
if (self.reverse_primers_start_col is None) != (
self.reverse_primers_end_col is None
):
collect_warnings.append(
"If one of reverse_primers_start_col or reverse_primers_end_col is set, then both must be."
)
if (self.insert_start_col is None) != (self.insert_end_col is None):
collect_warnings.append(
"If one of insert_start_col or insert_end_col is set, then both must be."
)
if collect_warnings:
raise ValueError(
"Errors with location column configuration:\n- "
+ "\n- ".join(collect_warnings)
)
return location_cols
return None
[docs] def check_target_names_are_unique(self):
"""
Raise an exception if the target names are not unique
:return: Nothing
"""
duplications = self.target_table[
self.target_table[self.target_name_col].duplicated(keep=False)
]
if not duplications.empty:
raise ValueError(
f"The following target_ids are duplicated: {duplications[self.target_name_col].unique()}"
)
[docs] def check_unique_target_info(self, columns_to_check):
"""
Raise an exception if the target info is not unique
:param columns_to_check: the columns to check to ensure the target info is unique
:return: Nothing
"""
groups = (
self.target_table.groupby(columns_to_check)[self.target_name_col]
.apply(list)
.reset_index(name=self.target_name_col)
)
# Keep only groups where more than one target shares the same primer pair
duplicated_groups = groups[groups[self.target_name_col].str.len() > 1]
if not duplicated_groups.empty:
msg_lines = ["The following targets have duplicated information:"]
for _, row in duplicated_groups.iterrows():
cols_info = ", ".join(f"{col}={row[col]}" for col in columns_to_check)
targets = ", ".join(map(str, row[self.target_name_col]))
msg_lines.append(f"targets: {targets} → {cols_info}")
raise ValueError("\n".join(msg_lines))
[docs] def summarise_targets_missing_optional_info(self):
"""
Warn about targets missing optional location fields.
For each of insert, forward-primer, and reverse-primer locations that was
requested, finds targets with empty coordinate fields and emits a warning.
Targets listed here are skipped when their location block is built.
:return: a tuple ``(missing_insert_loc, missing_fwd_primer_loc,
missing_rev_primer_loc)``; each element is a list of target names, or
None if that location type was not requested
"""
missing_insert_loc = None
missing_fwd_primer_loc = None
missing_rev_primer_loc = None
def check_missing(name, cols):
missing = self.target_table[self.target_table[cols].isnull().any(axis=1)][
self.target_name_col
].tolist()
if len(missing) > 0:
warnings.warn(
f"{name} location information was not added for the following targets that had empty fields: {', '.join(missing)}"
)
return missing
missing_insert_loc = (
check_missing(
"Insert", [self.chrom_col, self.insert_start_col, self.insert_end_col]
)
if self.insert_start_col
else None
)
missing_fwd_primer_loc = (
check_missing(
"Forward primer",
[
self.chrom_col,
self.forward_primers_start_col,
self.forward_primers_end_col,
],
)
if self.forward_primers_start_col
else None
)
missing_rev_primer_loc = (
check_missing(
"Reverse primer",
[
self.chrom_col,
self.reverse_primers_start_col,
self.reverse_primers_end_col,
],
)
if self.reverse_primers_start_col
else None
)
return missing_insert_loc, missing_fwd_primer_loc, missing_rev_primer_loc
[docs] def build_target_info_dict(
self,
genome_id_col: str | None = None,
):
"""
Build the list of target_info dictionaries from the target table.
Validates target-name uniqueness and primer/location uniqueness, then
assembles one dict per target including primer sequences and, where
available, insert and primer genomic locations.
:param genome_id_col: optional column holding the genome id for each
target; if omitted, a genome_id of 0 is used
:return: a list of target_info dictionaries
"""
# Check targets before putting into JSON
(
forward_primers_start_col,
forward_primers_end_col,
reverse_primers_start_col,
reverse_primers_end_col,
insert_start_col,
insert_end_col,
chrom_col,
strand_col,
ref_seq_col,
) = self.location_info_cols if self.location_info_cols else [None] * 9
# Check target information in the dataframe
self.check_target_names_are_unique()
columns_to_check = [self.forward_primers_seq_col, self.reverse_primers_seq_col]
if self.location_info_cols:
columns_to_check += [col for col in self.location_info_cols if col]
self.check_unique_target_info(list(set(columns_to_check)))
(
missing_insert_loc,
missing_fwd_primer_loc,
missing_rev_primer_loc,
) = self.summarise_targets_missing_optional_info()
# Put targets together in a dictionary
targets_dicts = []
for _, row in self.target_table.iterrows():
target_name = row[self.target_name_col]
target_dict = {
"target_name": target_name,
}
if self.gene_name_col:
target_dict["gene_name"] = row[self.gene_name_col]
if self.target_attributes_col:
target_dict["target_attributes"] = row[
self.target_attributes_col
].split(self.target_attributes_col_delimiter)
if self.additional_target_info_cols:
for col in self.additional_target_info_cols:
value = row[col]
# Convert numpy types to native Python types
if isinstance(value, (np.integer, np.int64)):
value = int(value)
elif isinstance(value, (np.floating, np.float64)):
value = float(value)
elif pd.isna(value):
value = None
target_dict[col] = value
# Add insert location info if location_info_cols are provided
if insert_start_col and target_name not in missing_insert_loc:
if genome_id_col:
genome_id = int(row[genome_id_col])
else:
genome_id = 0
target_dict["insert_location"] = {
"genome_id": genome_id,
"chrom": row[chrom_col],
"start": int(row[insert_start_col]),
"end": int(row[insert_end_col]),
}
if strand_col and pd.notna(row[strand_col]):
target_dict["insert_location"]["strand"] = row[strand_col]
if ref_seq_col and pd.notna(row[ref_seq_col]):
target_dict["insert_location"]["ref_seq"] = row[ref_seq_col]
# Extract primer information for each row
fwd_primer_dict = {"seq": row[self.forward_primers_seq_col]}
rev_primer_dict = {"seq": row[self.reverse_primers_seq_col]}
if forward_primers_start_col and target_name not in missing_fwd_primer_loc:
if genome_id_col:
genome_id = int(row[genome_id_col])
else:
genome_id = 0
fwd_primer_dict["location"] = {
"genome_id": genome_id,
"chrom": row[chrom_col],
"end": int(row[forward_primers_start_col]),
"start": int(row[forward_primers_end_col]),
}
if strand_col and pd.notna(row[strand_col]):
fwd_primer_dict["location"]["strand"] = row[strand_col]
if reverse_primers_start_col and target_name not in missing_rev_primer_loc:
if genome_id_col:
genome_id = int(row[genome_id_col])
else:
genome_id = 0
rev_primer_dict["location"] = {
"genome_id": genome_id,
"chrom": row[chrom_col],
"end": int(row[reverse_primers_end_col]),
"start": int(row[reverse_primers_start_col]),
}
if strand_col and pd.notna(row[strand_col]):
rev_primer_dict["location"]["strand"] = row[strand_col]
target_dict["forward_primer"] = fwd_primer_dict
target_dict["reverse_primer"] = rev_primer_dict
targets_dicts.append(target_dict)
return targets_dicts
[docs] def build_panel_info_dict(self, targets_dict):
"""
Build the panel_info dictionary, grouping targets into reactions.
If no reaction column was configured, all targets are placed in a single
reaction named ``full``.
:param targets_dict: the target_info list from :meth:`build_target_info_dict`
:return: a panel_info dictionary with ``panel_name`` and ``reactions``,
where each reaction lists target indices into ``targets_dict``
"""
panel_dict = {"panel_name": self.panel_name, "reactions": []}
target_indices = dict()
for i, target_dict in enumerate(targets_dict):
target_indices[target_dict["target_name"]] = i
if self.reaction_name_col:
reactions = (
self.target_table[self.reaction_name_col]
.str.split(self.reaction_name_col_delimiter)
.explode()
.str.strip() # Remove leading/trailing whitespace
.unique()
)
else:
reactions = ["full"]
self.target_table["reaction"] = "full"
self.reaction_name_col = "reaction"
for reaction in reactions:
# Filter rows where the reaction column contains this reaction
matching_rows = self.target_table[
self.target_table[self.reaction_name_col]
.str.split(self.reaction_name_col_delimiter)
.apply(
lambda x: reaction in [item.strip() for item in x]
if isinstance(x, list)
else False
)
]
# Get the indices for these targets
target_indices_for_reaction = [
target_indices[target_name]
for target_name in matching_rows[self.target_name_col]
]
panel_dict["reactions"].append(
{
"reaction_name": reaction,
"panel_targets": target_indices_for_reaction,
}
)
return panel_dict
def check_genome_info(genome_info):
    """
    Validate that genome info contains the required keys.

    Accepts either a single genome dict or a list of them, and checks each for
    the keys ``name``, ``genome_version``, ``taxon_id``, and ``url``.

    :param genome_info: a genome dict or list of genome dicts
    :raises TypeError: if genome_info is not a dict or list, or a list element is not a dict
    :raises ValueError: if the list is empty or any entry is missing required keys
    :return: Nothing
    """
    required_keys = {"name", "genome_version", "taxon_id", "url"}

    def require_keys(genome_dict, label):
        missing_keys = required_keys - genome_dict.keys()
        if missing_keys:
            # sorted() keeps the message stable; set iteration order is not.
            raise ValueError(
                f"{label} missing required keys: {', '.join(sorted(missing_keys))}"
            )

    if isinstance(genome_info, dict):
        require_keys(genome_info, "genome_info")
    elif isinstance(genome_info, list):
        if not genome_info:
            raise ValueError("genome_info list cannot be empty")
        for i, genome_dict in enumerate(genome_info):
            if not isinstance(genome_dict, dict):
                raise TypeError(
                    f"genome_info[{i}] must be a dict, but got {type(genome_dict).__name__}"
                )
            require_keys(genome_dict, f"genome_info[{i}]")
    else:
        raise TypeError(
            f"genome_info must be a dict or list, but got {type(genome_info).__name__}"
        )
def merge_panel_info_dicts(panel_info_dicts: list[dict]) -> dict:
    """
    Merge multiple panel_info dictionaries produced by panel_info_table_to_pmo.

    Target lists are concatenated (deduplicated by target_name; the first
    occurrence of a name is the one kept) and all genome references are
    collapsed so that genome identifiers remain valid across the merged
    structure. Targets and genomes are deep-copied, so the inputs are not
    modified and the result shares no nested objects with them.

    :param panel_info_dicts: a list of panel_info dicts, each with ``target_info``
        and ``panel_info`` (and optionally ``targeted_genomes``)
    :raises ValueError: if the list is empty, a dict lacks ``target_info``, a
        target lacks ``target_name``, or a target has location data without
        accompanying ``targeted_genomes``
    :return: a merged dict with ``panel_info`` and ``target_info`` keys, plus
        ``targeted_genomes`` if any genomes were present
    """
    if not panel_info_dicts:
        raise ValueError("panel_info_dicts must contain at least one entry.")

    merged_targets: list[dict] = []
    target_name_to_index: dict[str, int] = {}
    merged_panels: list[dict] = []
    merged_genomes: list[dict] = []
    genome_signature_to_index: dict[str, int] = {}

    def canonicalise_genome(genome: dict) -> str:
        # Key-order-independent signature used to recognise identical genomes.
        return json.dumps(genome, sort_keys=True)

    def primer_location(target_entry: dict, primer_key: str):
        # Tolerates a missing or non-dict primer entry instead of raising.
        primer = target_entry.get(primer_key)
        return primer.get("location") if isinstance(primer, dict) else None

    def remap_genome_ids(target_entry: dict, mapping: dict[int, int]) -> None:
        locations = [
            target_entry.get("insert_location"),
            primer_location(target_entry, "forward_primer"),
            primer_location(target_entry, "reverse_primer"),
        ]
        for location in locations:
            # Ids without an entry in the mapping are left as they are.
            if location and location.get("genome_id") in mapping:
                location["genome_id"] = mapping[location["genome_id"]]

    def require_no_locations(target_entry: dict, target_name) -> None:
        # Location data is meaningless without the genomes it refers to.
        if "insert_location" in target_entry:
            raise ValueError(
                f"target {target_name} has insert_location but no targeted_genomes information is included"
            )
        if primer_location(target_entry, "forward_primer") is not None:
            raise ValueError(
                f"target {target_name} has forward primer location but no targeted_genomes information is included"
            )
        if primer_location(target_entry, "reverse_primer") is not None:
            raise ValueError(
                f"target {target_name} has reverse primer location but no targeted_genomes information is included"
            )

    for panel_dict in panel_info_dicts:
        # Maps this input's local genome index to its index in merged_genomes.
        genome_mapping: dict[int, int] = {}
        has_genomes = "targeted_genomes" in panel_dict
        if has_genomes:
            for idx, genome in enumerate(panel_dict["targeted_genomes"]):
                signature = canonicalise_genome(genome)
                if signature not in genome_signature_to_index:
                    genome_signature_to_index[signature] = len(merged_genomes)
                    merged_genomes.append(copy.deepcopy(genome))
                genome_mapping[idx] = genome_signature_to_index[signature]

        if "target_info" not in panel_dict:
            raise ValueError("panel_info_dict missing 'target_info'.")

        for target in panel_dict["target_info"]:
            target_name = target.get("target_name")
            if target_name is None:
                raise ValueError("Each target_info entry must include a 'target_name'.")
            if target_name in target_name_to_index:
                continue
            target_copy = copy.deepcopy(target)
            if has_genomes:
                remap_genome_ids(target_copy, genome_mapping)
            else:
                require_no_locations(target_copy, target_name)
            target_name_to_index[target_name] = len(merged_targets)
            merged_targets.append(target_copy)

        for panel in panel_dict.get("panel_info", []):
            remapped_panel = {"panel_name": panel["panel_name"], "reactions": []}
            for reaction in panel.get("reactions", []):
                # Translate per-input target indices into merged-list indices.
                remapped_targets = [
                    target_name_to_index[
                        panel_dict["target_info"][target_idx]["target_name"]
                    ]
                    for target_idx in reaction["panel_targets"]
                ]
                remapped_panel["reactions"].append(
                    {
                        "reaction_name": reaction["reaction_name"],
                        "panel_targets": remapped_targets,
                    }
                )
            merged_panels.append(remapped_panel)

    ret = {"panel_info": merged_panels, "target_info": merged_targets}
    if merged_genomes:
        ret["targeted_genomes"] = merged_genomes
    return ret
def panel_info_table_to_pmo(
    target_table: pd.DataFrame,
    panel_name: str,
    genome_info: dict | list | None = None,
    target_name_col: str = "target_name",
    forward_primers_seq_col: str = "fwd_primer",
    reverse_primers_seq_col: str = "rev_primer",
    reaction_name_col: str | None = None,
    reaction_name_col_delimiter: str = ",",
    forward_primers_start_col: str | None = None,
    forward_primers_end_col: str | None = None,
    reverse_primers_start_col: str | None = None,
    reverse_primers_end_col: str | None = None,
    insert_start_col: str | None = None,
    insert_end_col: str | None = None,
    chrom_col: str | None = None,
    strand_col: str | None = None,
    ref_seq_col: str | None = None,
    gene_name_col: str | None = None,
    genome_id_col: str | None = None,
    target_attributes_col: str | None = None,
    target_attributes_col_delimiter: str = ",",
    additional_target_info_cols: list | None = None,
):
    """
    Convert a dataframe containing panel information into a dictionary of targets and reference information.

    :param target_table: the dataframe containing the target information
    :type target_table: pd.DataFrame
    :param panel_name: the panel ID assigned to the panel
    :type panel_name: str
    :param genome_info: reference genome information, needed if the target info contains genome location
    :type genome_info: dict or list, optional
    :param target_name_col: the name of the column containing the target IDs. Default: target_name
    :type target_name_col: str
    :param forward_primers_seq_col: the name of the column containing the sequence of the forward primer. Default: fwd_primer
    :type forward_primers_seq_col: str
    :param reverse_primers_seq_col: the name of the column containing the sequence of the reverse primer. Default: rev_primer
    :type reverse_primers_seq_col: str
    :param reaction_name_col: the name of the column containing which reaction the target was part of. By default they will all be put in one reaction.
    :type reaction_name_col: str, optional
    :param reaction_name_col_delimiter: the delimiter used to split the reaction name column into multiple reactions. Default is a comma.
    :type reaction_name_col_delimiter: str
    :param forward_primers_start_col: the name of the column containing the 0-based start coordinate of the forward primer
    :type forward_primers_start_col: str, optional
    :param forward_primers_end_col: the name of the column containing the 0-based end coordinate of the forward primer
    :type forward_primers_end_col: str, optional
    :param reverse_primers_start_col: the name of the column containing the 0-based start coordinate of the reverse primer
    :type reverse_primers_start_col: str, optional
    :param reverse_primers_end_col: the name of the column containing the 0-based end coordinate of the reverse primer
    :type reverse_primers_end_col: str, optional
    :param insert_start_col: the name of the column containing the 0-based start coordinate of the insert
    :type insert_start_col: str, optional
    :param insert_end_col: the name of the column containing the 0-based end coordinate of the insert
    :type insert_end_col: str, optional
    :param chrom_col: the name of the column containing the chromosome for the target
    :type chrom_col: str, optional
    :param strand_col: the name of the column containing the strand for the target
    :type strand_col: str, optional
    :param ref_seq_col: the name of the column containing the reference sequence for the insert
    :type ref_seq_col: str, optional
    :param gene_name_col: the name of the column containing the gene id
    :type gene_name_col: str, optional
    :param genome_id_col: the name of the column containing the genome ID (default is 0)
    :type genome_id_col: str, optional
    :param target_attributes_col: the name of the column containing the delimited classification types for the target
    :type target_attributes_col: str, optional
    :param target_attributes_col_delimiter: the delimiter used to split the target attributes column into multiple attributes. Default is a comma.
    :type target_attributes_col_delimiter: str
    :param additional_target_info_cols: a list of additional column names to copy verbatim into each target information dictionary
    :type additional_target_info_cols: list, optional
    :raises ValueError: if target_table is not a dataframe, or if any target carries
        location information while no genome_info was provided
    :return: a dict with ``panel_info`` and ``target_info`` keys, plus
        ``targeted_genomes`` when genome_info was provided
    :rtype: dict
    """
    if not isinstance(target_table, pd.DataFrame):
        raise ValueError("target_table must be a pandas DataFrame.")

    # Check additional columns if any are added
    check_additional_columns_exist(target_table, additional_target_info_cols)

    builder = PMOPanelBuilder(
        target_table=target_table,
        panel_name=panel_name,
        target_name_col=target_name_col,
        forward_primers_seq_col=forward_primers_seq_col,
        reverse_primers_seq_col=reverse_primers_seq_col,
        reaction_name_col=reaction_name_col,
        reaction_name_col_delimiter=reaction_name_col_delimiter,
        forward_primers_start_col=forward_primers_start_col,
        forward_primers_end_col=forward_primers_end_col,
        reverse_primers_start_col=reverse_primers_start_col,
        reverse_primers_end_col=reverse_primers_end_col,
        insert_start_col=insert_start_col,
        insert_end_col=insert_end_col,
        chrom_col=chrom_col,
        strand_col=strand_col,
        ref_seq_col=ref_seq_col,
        gene_name_col=gene_name_col,
        target_attributes_col=target_attributes_col,
        target_attributes_col_delimiter=target_attributes_col_delimiter,
        additional_target_info_cols=additional_target_info_cols,
    )

    # Normalise a single genome dict to a one-element list. An empty dict or
    # list is falsy and is treated the same as "no genome info".
    if genome_info and isinstance(genome_info, dict):
        genome_info = [genome_info]
    if genome_info:
        check_genome_info(genome_info)

    # Create a dictionary of targets and panels
    targets_dict = builder.build_target_info_dict(genome_id_col)

    if not genome_info:
        # Locations carry a genome_id, which is meaningless without genomes.
        for target in targets_dict:
            if "insert_location" in target:
                raise ValueError(
                    f"insert_location is provided for {target['target_name']} but targeted_genomes (genome_info) is not provided."
                )
            for primer_key in ("forward_primer", "reverse_primer"):
                if "location" in target[primer_key]:
                    raise ValueError(
                        f"{primer_key} location is provided for {target['target_name']} but targeted_genomes (genome_info) is not provided."
                    )

    panel_dict = builder.build_panel_info_dict(targets_dict)

    # Put together components
    panel_info_dict = {
        "panel_info": [panel_dict],
        "target_info": targets_dict,
    }
    remove_optional_null_values(
        panel_info_dict["target_info"],
        ["gene_name", "insert_location", "markers_of_interest", "target_attributes"],
    )
    if genome_info:
        # chromosomes / gff_url are genome fields, so they are stripped from the
        # genome entries. The helper presumably edits its argument in place (its
        # return value is unused above too), so work on a copy to leave the
        # caller's genome_info untouched.
        panel_info_dict["targeted_genomes"] = copy.deepcopy(genome_info)
        remove_optional_null_values(
            panel_info_dict["targeted_genomes"], ["chromosomes", "gff_url"]
        )
    return panel_info_dict