Source code for pmotools.pmo_engine.pmo_exporter
#!/usr/bin/env python3
import copy
import json
import os
from collections import defaultdict
from typing import NamedTuple
import pandas as pd
from openpyxl.utils import get_column_letter
from dataclasses import dataclass
from pmotools.pmo_engine.pmo_checker import PMOChecker
from pmotools.pmo_engine.pmo_processor import PMOProcessor
from pmotools import __version__ as __pmotools_version__
class BedLoc(NamedTuple):
    """
    One BED-style genomic interval.

    Instances are built when target / panel insert locations are pulled out of
    a PMO, and are what gets serialised when a BED file is written.

    :ivar chrom: name of the chromosome / contig
    :ivar start: start position (0-based)
    :ivar end: end position (exclusive)
    :ivar name: name of the target
    :ivar score: value for the BED score column; the insert length (``end - start``)
    :ivar strand: ``+`` or ``-``
    :ivar ref_seq: reference sequence of the insert, or an empty string when none is loaded
    :ivar extra_info: free-text key/value annotation, e.g. genome name/version
    """

    # Field order is the column order used when a record is written out.
    chrom: str
    start: int
    end: int
    name: str
    score: float
    strand: str
    ref_seq: str
    extra_info: str
[docs]class PMOExporter(object):
"""
A collection of functions to export information out of a PMO
"""
@staticmethod
def _is_primitive(x) -> bool:
"""
Check whether a value is a primitive that can be written directly to a table cell.
:param x: the value to check
:return: True if ``x`` is a str, int, float, bool, or None
"""
return isinstance(x, (str, int, float, bool)) or x is None
@staticmethod
def _is_primitive_list(x) -> bool:
    """
    Tell whether a value is a list or tuple whose elements are all primitives.

    :param x: the value to check
    :return: True when ``x`` is a list/tuple and every element passes
        :meth:`_is_primitive` (an empty list/tuple therefore also qualifies)
    """
    if not isinstance(x, (list, tuple)):
        return False
    for element in x:
        if not PMOExporter._is_primitive(element):
            return False
    return True
@staticmethod
def _is_exportable(x) -> bool:
    """
    Tell whether a value can be represented in a flat table.

    Primitives and lists/tuples of primitives can be exported; nested
    objects (e.g. TravelInfo, parasite densities) cannot.

    :param x: the value to check
    :return: True when ``x`` is a primitive or a list/tuple of primitives
    """
    if PMOExporter._is_primitive(x):
        return True
    return PMOExporter._is_primitive_list(x)
@staticmethod
def export_specimen_travel_meta_table(
    pmodata, separator: str = ","
) -> pd.DataFrame:
    """
    Export the ``travel_out_six_month`` records of the specimens of a PMO to a dataframe

    One row is produced per travel record; specimens without a ``travel_out_six_month`` field contribute no rows.
    Each row starts with the ``specimen_name`` followed by the primitive fields of the travel record; complex values are left out.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the specimen travel metadata
    """
    records = []
    for specimen in pmodata["specimen_info"]:
        if "travel_out_six_month" not in specimen:
            continue
        for trip in specimen["travel_out_six_month"]:
            record = {"specimen_name": specimen["specimen_name"]}
            for field, content in trip.items():
                if PMOExporter._is_primitive(content):
                    record[field] = content
                elif PMOExporter._is_primitive_list(content):
                    record[field] = separator.join(map(str, content))
            records.append(record)
    return pd.DataFrame(records)
@staticmethod
def export_specimen_meta_table(pmodata, separator: str = ",") -> pd.DataFrame:
    """
    Export the specimen meta information of a PMO to a dataframe

    ``project_id`` is replaced by a ``project_name`` column looked up in ``project_info``.
    Values of complex object types like TravelInfo or Parasite densities are skipped, best to export such values in their own tables

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the specimen metadata
    """

    def flatten(specimen) -> dict:
        # one flat record per specimen
        record = {}
        for field, content in specimen.items():
            if field == "project_id":
                record["project_name"] = pmodata["project_info"][content][
                    "project_name"
                ]
            elif PMOExporter._is_primitive(content):
                record[field] = content
            elif PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        return record

    return pd.DataFrame([flatten(specimen) for specimen in pmodata["specimen_info"]])
@staticmethod
def export_library_sample_meta_table(pmodata, separator: str = ",") -> pd.DataFrame:
    """
    Export the library_sample meta information of a PMO to a dataframe

    The ``sequencing_info_id``, ``specimen_id`` and ``panel_id`` index fields are replaced by the
    corresponding ``sequencing_info_name``, ``specimen_name`` and ``panel_name`` columns.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the library_sample metadata
    """
    # index field -> (PMO section it points into, name field to export instead)
    linked_fields = {
        "sequencing_info_id": ("sequencing_info", "sequencing_info_name"),
        "specimen_id": ("specimen_info", "specimen_name"),
        "panel_id": ("panel_info", "panel_name"),
    }
    records = []
    for library_sample in pmodata["library_sample_info"]:
        record = {}
        for field, content in library_sample.items():
            if field in linked_fields:
                section, name_field = linked_fields[field]
                record[name_field] = pmodata[section][content][name_field]
            elif PMOExporter._is_primitive(content):
                record[field] = content
            elif PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        records.append(record)
    return pd.DataFrame(records)
@staticmethod
def export_bioinformatics_run_info_meta_table(
    pmodata, separator: str = ","
) -> pd.DataFrame:
    """
    Export the bioinformatics_run_info meta information of a PMO to a dataframe

    Each row gets a ``run_id`` column holding the 0-based position of the run in ``bioinformatics_run_info``.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the bioinformatics_run_info metadata
    :raises ValueError: if the PMO has no ``bioinformatics_run_info``
    """
    if "bioinformatics_run_info" not in pmodata:
        raise ValueError("no bioinformatics_run_info found in input PMO")
    records = []
    for position, run_info in enumerate(pmodata["bioinformatics_run_info"]):
        record = {"run_id": position}
        for field, content in run_info.items():
            if PMOExporter._is_primitive(content):
                record[field] = content
            elif PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        records.append(record)
    return pd.DataFrame(records)
@staticmethod
def export_bioinformatics_methods_info_meta_table(
    pmodata, separator: str = ","
) -> pd.DataFrame:
    """
    Export the bioinformatics_methods_info meta information of a PMO to a dataframe

    One row is produced per entry of the ``methods`` list of every bioinformatics_methods_info object.
    Each row carries the primitive fields of the enclosing object, a ``bioinformatics_methods_id``
    and a ``method_id`` (both counted from 1), followed by the fields of the method itself.
    List values, including those found on the individual methods, are joined with ``separator``.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the bioinformatics_methods_info metadata
    :raises ValueError: if the PMO has no ``bioinformatics_methods_info``
    """
    # @todo write pytest export_bioinformatics_methods_info_meta_table
    if "bioinformatics_methods_info" not in pmodata:
        raise ValueError("no bioinformatics_methods_info found in input PMO")
    rows = []
    for methods_id, methods_info in enumerate(
        pmodata["bioinformatics_methods_info"], start=1
    ):
        # fields shared by every method of this bioinformatics_methods_info object
        shared = {"bioinformatics_methods_id": methods_id}
        for key, value in methods_info.items():
            if PMOExporter._is_primitive(value):
                shared[key] = value
            elif PMOExporter._is_primitive_list(value):
                shared[key] = separator.join(str(v) for v in value)
        for method_id, method in enumerate(methods_info["methods"], start=1):
            # shared only holds flat values, so a shallow copy is enough
            row = dict(shared)
            row["method_id"] = method_id
            for key, value in method.items():
                if PMOExporter._is_primitive_list(value):
                    # flatten list values the same way as every other exporter,
                    # so the cell holds text rather than a Python list
                    row[key] = separator.join(str(v) for v in value)
                else:
                    row[key] = value
            rows.append(row)
    return pd.DataFrame(rows)
@staticmethod
def export_sequencing_info_meta_table(
    pmodata, separator: str = ","
) -> pd.DataFrame:
    """
    Export the sequencing_info meta information of a PMO to a dataframe

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the sequencing_info metadata
    :raises ValueError: if the PMO has no ``sequencing_info``
    """
    # sequencing_info must be loaded to export it
    if "sequencing_info" not in pmodata:
        raise ValueError("no sequencing_info found in input PMO")

    def flatten(entry) -> dict:
        record = {}
        for field, content in entry.items():
            if PMOExporter._is_primitive(content):
                record[field] = content
            elif PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        return record

    return pd.DataFrame([flatten(entry) for entry in pmodata["sequencing_info"]])
@staticmethod
def export_project_info_meta_table(pmodata, separator: str = ",") -> pd.DataFrame:
    """
    Export the project_info meta information of a PMO to a dataframe

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the project_info metadata
    :raises ValueError: if the PMO has no ``project_info``
    """
    # project_info must be loaded to export it
    if "project_info" not in pmodata:
        raise ValueError("no project_info found in input PMO")
    records = []
    for project in pmodata["project_info"]:
        record = {}
        for field, content in project.items():
            if PMOExporter._is_primitive(content):
                record[field] = content
                continue
            if PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        records.append(record)
    return pd.DataFrame(records)
@staticmethod
def export_panel_info_meta_table(pmodata, separator: str = ",") -> pd.DataFrame:
    """
    Export the panel meta information of a PMO to a dataframe

    One row is produced per (panel, target) pair. Each row holds the primitive fields of the panel,
    the ``target_name`` and a ``reaction_name`` column listing the reactions that contain the target.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the panel metadata
    """
    records = []
    for panel in pmodata["panel_info"]:
        panel_fields = {}
        for field, content in panel.items():
            if PMOExporter._is_primitive(content):
                panel_fields[field] = content
            elif PMOExporter._is_primitive_list(content):
                panel_fields[field] = separator.join(map(str, content))
        # target_name -> names of the reactions of this panel that include the target
        reactions_by_target = {}
        for reaction in panel["reactions"]:
            for target_id in reaction["panel_targets"]:
                target_name = pmodata["target_info"][target_id]["target_name"]
                reactions_by_target.setdefault(target_name, []).append(
                    reaction["reaction_name"]
                )
        for target_name, reaction_names in reactions_by_target.items():
            records.append(
                {
                    **panel_fields,
                    "target_name": target_name,
                    "reaction_name": separator.join(reaction_names),
                }
            )
    return pd.DataFrame(records)
@staticmethod
def export_target_info_meta_table(pmodata, separator: str = ",") -> pd.DataFrame:
    """
    Export the target meta information of a PMO to a dataframe

    Primers are flattened into ``<primer>_seq`` plus one ``<primer>_<key>`` column per key of the
    primer ``location`` (when present); ``insert_location`` is flattened into ``insert_<key>`` columns.
    ``target_name`` and the primer sequences lead the table, the remaining columns are sorted by name.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the target metadata
    """
    records = []
    for target in pmodata["target_info"]:
        record = {}
        for field, content in target.items():
            if field in ("forward_primer", "reverse_primer"):
                record[field + "_seq"] = content["seq"]
                if "location" in content:
                    for loc_key, loc_value in content["location"].items():
                        record[field + "_" + loc_key] = loc_value
            elif field == "insert_location":
                for loc_key, loc_value in content.items():
                    record["insert_" + loc_key] = loc_value
            elif PMOExporter._is_primitive(content):
                record[field] = content
            elif PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        records.append(record)
    table = pd.DataFrame(records)
    front_cols = ["target_name", "forward_primer_seq", "reverse_primer_seq"]
    present_front = [name for name in front_cols if name in table.columns]
    others = sorted(name for name in table.columns if name not in front_cols)
    return table[present_front + others]
@staticmethod
def export_pmo_header_table(pmodata, separator: str = ",") -> pd.DataFrame:
    """
    Export the pmo header meta information of a PMO to a dataframe

    The result has a single row. ``generation_method`` is flattened into
    ``generation_method.program_version`` and ``generation_method.program_name``;
    ``pmo_version`` leads the table, the remaining columns are sorted by name.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the pmo header metadata
    :raises ValueError: if the PMO has no ``pmo_header``
    """
    if "pmo_header" not in pmodata:
        raise ValueError("no pmo_header found in input PMO")
    record = {}
    for field, content in pmodata["pmo_header"].items():
        if field == "generation_method":
            record["generation_method.program_version"] = content["program_version"]
            record["generation_method.program_name"] = content["program_name"]
        elif PMOExporter._is_primitive(content):
            record[field] = content
        elif PMOExporter._is_primitive_list(content):
            record[field] = separator.join(map(str, content))
    table = pd.DataFrame([record])
    front_cols = ["pmo_version"]
    present_front = [name for name in front_cols if name in table.columns]
    others = sorted(name for name in table.columns if name not in front_cols)
    return table[present_front + others]
@staticmethod
def export_targeted_genomes_meta_table(
    pmodata, separator: str = ","
) -> pd.DataFrame:
    """
    Export the targeted genomes meta information of a PMO to a dataframe

    Each row gets a ``genome_id`` column holding the 0-based position of the genome in ``targeted_genomes``.
    The ``name``, ``genome_version``, ``taxon_id``, ``genome_id`` and ``url`` columns lead the table
    (when present), the remaining columns are sorted by name.

    :param pmodata: the pmo export the information from
    :param separator: the separator to use for list values
    :return: a pandas dataframe of the genomes metadata
    :raises ValueError: if the PMO has no ``targeted_genomes``
    """
    if "targeted_genomes" not in pmodata:
        raise ValueError("no targeted_genomes found in input PMO")
    records = []
    for position, genome in enumerate(pmodata["targeted_genomes"]):
        record = {"genome_id": position}
        for field, content in genome.items():
            if PMOExporter._is_primitive(content):
                record[field] = content
            elif PMOExporter._is_primitive_list(content):
                record[field] = separator.join(map(str, content))
        records.append(record)
    table = pd.DataFrame(records)
    front_cols = ["name", "genome_version", "taxon_id", "genome_id", "url"]
    present_front = [name for name in front_cols if name in table.columns]
    others = sorted(name for name in table.columns if name not in front_cols)
    return table[present_front + others]
[docs] @staticmethod
def write_bed_locs(bed_locs: list[BedLoc], fnp, add_header: bool = False):
"""
Write out a list of BedLoc to a file, will auto overwrite it
:param bed_locs: a list of BedLoc
:param fnp: output file path, will be overwritten if it exists
:param add_header: add header of #chrom,start end,name,score,strand,ref_seq,extra_info, starts with comment so tools will treat it as a comment line
"""
with open(fnp, "w") as f:
if add_header:
f.write(
"\t".join(
[
"#chrom",
"start",
"end",
"name",
"score",
"strand",
"ref_seq",
"extra_info",
]
)
)
for bed_loc in bed_locs:
f.write(
"\t".join(
[
bed_loc.chrom,
str(bed_loc.start),
str(bed_loc.end),
bed_loc.name,
str(bed_loc.score),
bed_loc.strand,
str(bed_loc.ref_seq),
bed_loc.extra_info,
]
)
)
f.write("\n")
@staticmethod
def extract_targets_insert_bed_loc(
    pmodata, select_target_ids: list[int] = None, sort_output: bool = True
):
    """
    Extract out of a PMO the insert location for targets, will add ref seq if loaded into PMO

    The strand defaults to ``+`` and the ref seq to an empty string when the insert location does not provide them.
    The score of each record is the insert length and the extra info names the genome the insert is located in.

    :param pmodata: the PMO to extract from
    :param select_target_ids: a list of target ids to select, if None will select all targets
    :param sort_output: whether to sort output by genomic location (chrom, start, end)
    :return: a list of target inserts, with named tuples with fields: chrom, start, end, name, score, strand, ref_seq, extra_info
    :raises Exception: if a selected target has no ``insert_location``
    """
    if select_target_ids is None:
        select_target_ids = range(len(pmodata["target_info"]))
    inserts = []
    for target_id in select_target_ids:
        target = pmodata["target_info"][target_id]
        if "insert_location" not in target:
            raise Exception(
                "no insert_location in pmodata for target id "
                + str(target_id)
                + " target_name "
                + str(target["target_name"])
                + ", cannot extract insert_location"
            )
        insert = target["insert_location"]
        genome = pmodata["targeted_genomes"][insert["genome_id"]]
        genome_label = genome["name"] + "_" + genome["genome_version"]
        annotation = "[" + "genome_name_version=" + genome_label + ";]"
        strand = insert["strand"] if "strand" in insert else "+"
        ref_seq = insert["ref_seq"] if "ref_seq" in insert else ""
        inserts.append(
            BedLoc(
                chrom=insert["chrom"],
                start=insert["start"],
                end=insert["end"],
                name=target["target_name"],
                score=insert["end"] - insert["start"],
                strand=strand,
                ref_seq=ref_seq,
                extra_info=annotation,
            )
        )
    if not sort_output:
        return inserts
    inserts.sort(key=lambda bed: (bed.chrom, bed.start, bed.end))
    return inserts
@staticmethod
def extract_panels_insert_bed_loc(
    pmodata, select_panel_ids: list[int] = None, sort_output: bool = True
):
    """
    Extract out of a PMO the insert location for panels, will add ref seq if loaded into PMO

    The strand defaults to ``+`` and the ref seq to an empty string when the insert location does not provide them.
    The extra info of each record names the genome, panel and reaction the insert belongs to.

    :param pmodata: the PMO to extract from
    :param select_panel_ids: a list of panels ids to select, if None will select all panels
    :param sort_output: whether to sort the inserts of each panel by genomic location (chrom, start, end)
    :return: a dict keyed by panel id, each value a list of target inserts, with named tuples with fields: chrom, start, end, name, score, strand, ref_seq, extra_info
    :raises Exception: if a target of a selected panel has no ``insert_location``
    """
    bed_loc_out = {}
    if select_panel_ids is None:
        select_panel_ids = range(len(pmodata["panel_info"]))
    for panel_id in select_panel_ids:
        panel = pmodata["panel_info"][panel_id]
        bed_loc_out_per_panel = []
        for reaction in panel["reactions"]:
            for target_id in reaction["panel_targets"]:
                tar = pmodata["target_info"][target_id]
                if "insert_location" not in tar:
                    raise Exception(
                        "no insert_location in pmodata for target id "
                        + str(target_id)
                        + " target_name "
                        + str(tar["target_name"])
                        + ", cannot extract insert_location"
                    )
                insert = tar["insert_location"]
                genome_info = pmodata["targeted_genomes"][insert["genome_id"]]
                genome_name_version = (
                    genome_info["name"] + "_" + genome_info["genome_version"]
                )
                extra_info = (
                    f"[genome_name_version={genome_name_version};"
                    f"panel={panel['panel_name']};"
                    f"reaction={reaction['reaction_name']};]"
                )
                bed_loc_out_per_panel.append(
                    BedLoc(
                        insert["chrom"],
                        insert["start"],
                        insert["end"],
                        tar["target_name"],
                        insert["end"] - insert["start"],
                        insert.get("strand", "+"),
                        insert.get("ref_seq", ""),
                        extra_info,
                    )
                )
        if sort_output:
            # sort in place and keep going: returning here would drop every
            # panel after the first and hand back a list instead of the dict
            bed_loc_out_per_panel.sort(
                key=lambda bed: (bed.chrom, bed.start, bed.end)
            )
        bed_loc_out[panel_id] = bed_loc_out_per_panel
    return bed_loc_out
@staticmethod
def extract_alleles_per_sample_table(
    pmodata,
    additional_specimen_info_fields: list[str] = None,
    additional_library_sample_info_fields: list[str] = None,
    additional_microhap_fields: list[str] = None,
    additional_representative_info_fields: list[str] = None,
    default_base_col_names: list[str] = [
        "library_sample_name",
        "target_name",
        "seq",
    ],
    jsonschema_fnp=os.path.join(
        os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        ),
        "schemas/",
        f"portable_microhaplotype_object_v{__pmotools_version__}.schema.json",
    ),
    validate_pmo: bool = False,
) -> pd.DataFrame:
    """
    Create a pd.Dataframe with one row per detected microhaplotype, holding the library sample name,
    target name, allele sequence and bioinformatics run name. Can optionally add on any other additional fields

    Every requested additional field must be present in at least one object of the matching section,
    objects that lack the field get ``NA`` in that column.

    :param pmodata: the data to write from
    :param additional_specimen_info_fields: any additional fields to write from the specimen_info object
    :param additional_library_sample_info_fields: any additional fields to write from the library_samples object
    :param additional_microhap_fields: any additional fields to write from the microhap object
    :param additional_representative_info_fields: any additional fields to write from the representative_microhaplotype_sequences object
    :param default_base_col_names: The default column name for the library_sample_name, target_name and seq
    :param jsonschema_fnp: path to the jsonschema schema file to validate the PMO against
    :param validate_pmo: whether to validate the PMO with a jsonschema
    :return: pandas dataframe
    :raises Exception: if a requested additional field is found nowhere in its section, or if
        ``default_base_col_names`` does not have exactly 3 entries
    """
    # optional schema validation of the input
    if validate_pmo:
        with open(jsonschema_fnp) as schema_file:
            checker = PMOChecker(json.load(schema_file))
            checker.validate_pmo_json(pmodata)

    # specimen fields: at least 1 specimen must supply each requested field
    if additional_specimen_info_fields is not None:
        seen_fields = set()
        for requested in additional_specimen_info_fields:
            for specimen_data in pmodata["specimen_info"]:
                if requested in specimen_data:
                    seen_fields.add(requested)
        meta_fields_with_no_samples = (
            set(additional_specimen_info_fields) - seen_fields
        )
        if meta_fields_with_no_samples:
            raise Exception(
                f"No specimen_info have data for fields: {', '.join(meta_fields_with_no_samples)}"
            )

    # library sample fields: at least 1 library sample must supply each requested field
    if additional_library_sample_info_fields is not None:
        seen_fields = set()
        for requested in additional_library_sample_info_fields:
            for library_data in pmodata["library_sample_info"]:
                if requested in library_data:
                    seen_fields.add(requested)
        meta_fields_with_no_samples = (
            set(additional_library_sample_info_fields) - seen_fields
        )
        if meta_fields_with_no_samples:
            raise Exception(
                f"No library_sample_info have data for fields: {', '.join(meta_fields_with_no_samples)}"
            )

    # detected microhaplotype fields: at least 1 haplotype must supply each requested field
    if additional_microhap_fields is not None:
        seen_fields = set()
        for requested in additional_microhap_fields:
            for detected in pmodata["detected_microhaplotypes"]:
                for library_samples_data in detected["library_samples"]:
                    for target_results in library_samples_data["target_results"]:
                        for mhap in target_results["mhaps"]:
                            if requested in mhap:
                                seen_fields.add(requested)
        additional_microhap_fields_with_no_samples = (
            set(additional_microhap_fields) - seen_fields
        )
        if additional_microhap_fields_with_no_samples:
            raise Exception(
                f"No detected_microhaplotypes have data for fields: {', '.join(additional_microhap_fields_with_no_samples)}"
            )

    # representative microhaplotype fields: at least 1 haplotype must supply each requested field
    if additional_representative_info_fields is not None:
        # seq never counts as found, it is already output by default
        seen_fields = set()
        for requested in additional_representative_info_fields:
            for rep_target in pmodata["representative_microhaplotypes"]["targets"]:
                for rep_mhap in rep_target["microhaplotypes"]:
                    if requested in rep_mhap and requested != "seq":
                        seen_fields.add(requested)
        additional_microhap_fields_with_no_samples = (
            set(additional_representative_info_fields) - seen_fields
        )
        if additional_microhap_fields_with_no_samples:
            raise Exception(
                f"No representative_microhaplotype_sequences have data for fields: {', '.join(additional_microhap_fields_with_no_samples)}"
            )

    if len(default_base_col_names) != 3:
        raise Exception(
            "Must have 3 default columns for allele counts, not {}".format(
                len(default_base_col_names)
            )
        )

    specimens = pmodata["specimen_info"]
    targets = pmodata["target_info"]
    library_samples = pmodata["library_sample_info"]
    detected_runs = pmodata["detected_microhaplotypes"]
    rep_targets = pmodata["representative_microhaplotypes"]["targets"]
    bioinformatics_run_names = None
    if "bioinformatics_run_info" in pmodata:
        bioinformatics_run_names = PMOProcessor.get_bioinformatics_run_names(
            pmodata
        )

    records = []
    for detected_microhaplotypes_count, detected_run in enumerate(detected_runs):
        # None when the entry is not linked to a bioinformatics run
        bioinformatics_run_id = detected_run.get("bioinformatics_run_id")
        for sample_data in detected_run["library_samples"]:
            library_meta = library_samples[sample_data["library_sample_id"]]
            specimen_meta = specimens[library_meta["specimen_id"]]
            for target_data in sample_data["target_results"]:
                rep_target = rep_targets[target_data["mhaps_target_id"]]
                target_name = targets[rep_target["target_id"]]["target_name"]
                for microhap_data in target_data["mhaps"]:
                    rep_hap_meta = rep_target["microhaplotypes"][
                        microhap_data["mhap_id"]
                    ]
                    record = {
                        default_base_col_names[0]: library_meta[
                            "library_sample_name"
                        ],
                        default_base_col_names[1]: target_name,
                        default_base_col_names[2]: rep_hap_meta["seq"],
                    }
                    if (
                        bioinformatics_run_names is None
                        or bioinformatics_run_id is None
                    ):
                        # no run name available, fall back to the position of
                        # this entry in detected_microhaplotypes
                        record[
                            "bioinformatics_run_name"
                        ] = f"detected_microhaplotypes_count_idx_{detected_microhaplotypes_count}"
                    else:
                        record["bioinformatics_run_name"] = bioinformatics_run_names[
                            bioinformatics_run_id
                        ]
                    # (requested fields, object to read them from), in output column order
                    extras = (
                        (additional_library_sample_info_fields, library_meta),
                        (additional_specimen_info_fields, specimen_meta),
                        (additional_microhap_fields, microhap_data),
                        (additional_representative_info_fields, rep_hap_meta),
                    )
                    for requested_fields, source in extras:
                        if requested_fields is None:
                            continue
                        for field in requested_fields:
                            record[field] = source.get(field, "NA")
                    records.append(record)
    # Build and return DataFrame
    return pd.DataFrame(records)
[docs] @staticmethod
def list_library_sample_names_per_specimen_name(
pmodata,
select_specimen_ids: list[int] = None,
select_specimen_names: list[str] = None,
) -> pd.DataFrame:
"""
List all the library_sample_names per specimen_name
:param pmodata: the PMO
:param select_specimen_ids: a list of specimen_ids to select, if None, all specimen_ids are used
:param select_specimen_names: a list of specimen_names to select, if None, all specimen_names are used
:return: a pandas dataframe with 3 columns, specimen_id, library_sample_id, and library_sample_id_count(the number of library_sample_ids per specimen_id)
"""
if select_specimen_ids is not None and select_specimen_names is not None:
raise ValueError(
"Cannot specify both select_specimen_ids and select_specimen_names"
)
lib_samples_per_spec = defaultdict(list[str])
if select_specimen_names is not None:
select_specimen_ids = PMOProcessor.get_index_of_specimen_names(
pmodata, select_specimen_names
)
for lib_sample in pmodata["library_sample_info"]:
if (
select_specimen_ids is None
or lib_sample["specimen_id"] in select_specimen_ids
):
lib_samples_per_spec[
pmodata["specimen_info"][lib_sample["specimen_id"]]["specimen_name"]
].append(lib_sample["library_sample_name"])
specimens_not_list = []
for specimen in pmodata["specimen_info"]:
if specimen["specimen_name"] not in lib_samples_per_spec:
specimens_not_list.append(specimen["specimen_name"])
# Prepare the data for DataFrame creation
data = []
for specimen_name, library_sample_names in lib_samples_per_spec.items():
for library_sample_name in library_sample_names:
data.append(
{
"specimen_name": specimen_name,
"library_sample_name": library_sample_name,
"library_sample_count": len(library_sample_names),
}
)
# Create the DataFrame
df = pd.DataFrame(
data,
columns=["specimen_name", "library_sample_name", "library_sample_count"],
)
return df
@staticmethod
def _write_sheet(writer: pd.ExcelWriter, config: "PMOExporter.SheetConfig") -> None:
    """Write the DataFrame of one sheet config to its Excel sheet (without the index), then autofit the column widths."""
    config.df.to_excel(writer, index=False, sheet_name=config.sheet_name)
    autofit_options = {
        "specific_cols": config.specific_cols,
        "max_row_check": config.max_row_check,
    }
    PMOExporter._autofit_columns(
        writer, config.sheet_name, config.df, **autofit_options
    )
@staticmethod
def _autofit_columns(
    writer: pd.ExcelWriter,
    sheet_name: str,
    df: pd.DataFrame,
    specific_cols: list[str] | None = None,
    max_row_check: int | None = None,
) -> None:
    """
    Set the width of the columns of an Excel worksheet from the length of their content.

    Each adjusted column becomes one character wider than the longest of its
    header and its (sampled) cell values, measured on their string form.

    Args:
        writer: The active ExcelWriter instance (post df.to_excel call).
        sheet_name: The name of the worksheet to adjust.
        df: The DataFrame that was written to the sheet.
        specific_cols: Optional list of column names to adjust. If None, all columns are adjusted.
        max_row_check: Optional max number of leading rows to sample when calculating width.
                       The header is always taken into account regardless of this value.
                       If None, all rows are checked.
    """
    worksheet = writer.sheets[sheet_name]
    if specific_cols is None:
        names_to_fit = list(df.columns)
    else:
        names_to_fit = specific_cols
    for name in names_to_fit:
        # worksheet columns are numbered from 1, DataFrame positions from 0
        position = df.columns.get_loc(name) + 1
        values = df[name]
        if max_row_check is not None:
            values = values.iloc[:max_row_check]
        if len(values) > 0:
            longest_cell = values.astype(str).map(len).max()
        else:
            longest_cell = 0
        content_width = max(longest_cell, len(str(name)))
        letter = get_column_letter(position)
        worksheet.column_dimensions[letter].width = content_width + 1
[docs] @dataclass
class SheetConfig:
"""Configuration for writing a DataFrame to an Excel sheet."""
sheet_name: str
df: pd.DataFrame
max_row_check: int | None = None
specific_cols: list[str] | None = None
@staticmethod
def _build_pmo_sheet_configs(pmo) -> "list[PMOExporter.SheetConfig]":
    """
    Assemble, in output order, the SheetConfigs describing the Excel export of a PMO object.

    The header, target, panel, microhaplotype, specimen and library sample sheets are always built;
    the genome, project, sequencing and bioinformatics sheets only when their key is present in pmo.
    """
    make_sheet = PMOExporter.SheetConfig
    configs = [
        make_sheet("PMO Header", PMOExporter.export_pmo_header_table(pmo)),
        make_sheet(
            "Required Panel Targets", PMOExporter.export_target_info_meta_table(pmo)
        ),
        make_sheet(
            "Required Panel Info", PMOExporter.export_panel_info_meta_table(pmo)
        ),
    ]
    if "targeted_genomes" in pmo:
        configs.append(
            make_sheet(
                "Optional GenomeInfo",
                PMOExporter.export_targeted_genomes_meta_table(pmo),
            )
        )
    # these tables grow with the number of samples, so only the first rows
    # are sampled when autofitting their columns
    configs += [
        make_sheet(
            "Required Microhaplotype",
            # @todo add in the optional fields of the detected_microhaplotypes and representative_microhaplotypes
            PMOExporter.extract_alleles_per_sample_table(
                pmo, additional_microhap_fields=["reads"]
            ),
            max_row_check=10,
        ),
        make_sheet(
            "Optional Specimen Level",
            PMOExporter.export_specimen_meta_table(pmo),
            max_row_check=10,
        ),
        make_sheet(
            "Optional LibrarySampleInfo",
            PMOExporter.export_library_sample_meta_table(pmo),
            max_row_check=10,
        ),
    ]
    # (key that must be present in pmo, sheet name, exporter), in output order
    trailing_optional = (
        (
            "project_info",
            "Optional ProjectInfo",
            PMOExporter.export_project_info_meta_table,
        ),
        (
            "sequencing_info",
            "Optional SequencingInfo",
            PMOExporter.export_sequencing_info_meta_table,
        ),
        (
            "bioinformatics_methods_info",
            "Optional Bioinformatics Methods",
            PMOExporter.export_bioinformatics_methods_info_meta_table,
        ),
        (
            "bioinformatics_run_info",
            "Optional Bioinformatics Run",
            PMOExporter.export_bioinformatics_run_info_meta_table,
        ),
    )
    for pmo_key, sheet_name, exporter in trailing_optional:
        if pmo_key in pmo:
            configs.append(make_sheet(sheet_name, exporter(pmo)))
    return configs
@staticmethod
def export_to_excel(pmo, output_path: str) -> None:
    """
    Export a PMO object to a multi-sheet Excel file.

    Args:
        pmo: The PMO object to export.
        output_path: The path to write the Excel file to.
    """
    # Build every table before the writer is opened: if a table cannot be
    # built, the error is raised as-is and no workbook is started, instead
    # of surfacing while the writer finalises a workbook with no sheets.
    sheet_configs = PMOExporter._build_pmo_sheet_configs(pmo)
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for config in sheet_configs:
            PMOExporter._write_sheet(writer, config)