Source code for pmotools.pmo_engine.pmo_exporter

#!/usr/bin/env python3
import copy
import json
import os
from collections import defaultdict
from typing import NamedTuple
import pandas as pd
from openpyxl.utils import get_column_letter
from dataclasses import dataclass

from pmotools.pmo_engine.pmo_checker import PMOChecker
from pmotools.pmo_engine.pmo_processor import PMOProcessor

from pmotools import __version__ as __pmotools_version__


[docs]class BedLoc(NamedTuple): """ A single BED-format genomic location. Used when extracting target / panel insert locations out of a PMO so they can be written to a BED file. :ivar chrom: chromosome / contig name :ivar start: 0-based start position :ivar end: end position (exclusive) :ivar name: target name :ivar score: BED score column; here the insert length (``end - start``) :ivar strand: ``+`` or ``-`` :ivar ref_seq: reference sequence for the insert, empty string if not loaded :ivar extra_info: free-text key/value annotation, e.g. genome name/version """ chrom: str start: int end: int name: str score: float strand: str ref_seq: str extra_info: str
[docs]class PMOExporter(object): """ A collection of functions to export information out of a PMO """ @staticmethod def _is_primitive(x) -> bool: """ Check whether a value is a primitive that can be written directly to a table cell. :param x: the value to check :return: True if ``x`` is a str, int, float, bool, or None """ return isinstance(x, (str, int, float, bool)) or x is None @staticmethod def _is_primitive_list(x) -> bool: """ Check whether a value is a list or tuple containing only primitives. :param x: the value to check :return: True if ``x`` is a list/tuple and every element is a primitive (see :meth:`is_primitive`) """ return isinstance(x, (list, tuple)) and all( PMOExporter._is_primitive(i) for i in x ) @staticmethod def _is_exportable(x) -> bool: """ Check whether a value can be exported to a flat table. A value is exportable if it is a primitive or a list/tuple of primitives; complex nested objects (e.g. TravelInfo, parasite densities) are not. :param x: the value to check :return: True if ``x`` is a primitive or a primitive list """ return PMOExporter._is_primitive(x) or PMOExporter._is_primitive_list(x)
[docs] @staticmethod def export_specimen_travel_meta_table( pmodata, separator: str = "," ) -> pd.DataFrame: """ Export the specimen meta information of a PMO to a dataframe Currently avoiding exporting values of complex object types like TravelInfo or Parasite densities, best to export such values in their own tables :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the specimen metadata """ rows = [] for specimen in pmodata["specimen_info"]: if "travel_out_six_month" in specimen: for travel_meta in specimen["travel_out_six_month"]: export_row = {"specimen_name": specimen["specimen_name"]} for key, value in travel_meta.items(): if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_specimen_meta_table(pmodata, separator: str = ",") -> pd.DataFrame: """ Export the specimen meta information of a PMO to a dataframe Currently avoiding exporting values of complex object types like TravelInfo or Parasite densities, best to export such values in their own tables :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the specimen metadata """ rows = [] for specimen in pmodata["specimen_info"]: export_row = {} for key, value in specimen.items(): if "project_id" == key: export_row["project_name"] = pmodata["project_info"][value][ "project_name" ] elif PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_library_sample_meta_table(pmodata, separator: str = ",") -> pd.DataFrame: """ Export the library_sample meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the library_sample metadata """ rows = [] for library_sample in pmodata["library_sample_info"]: export_row = {} for key, value in library_sample.items(): if "sequencing_info_id" == key: export_row["sequencing_info_name"] = pmodata["sequencing_info"][ value ]["sequencing_info_name"] elif "specimen_id" == key: export_row["specimen_name"] = pmodata["specimen_info"][value][ "specimen_name" ] elif "panel_id" == key: export_row["panel_name"] = pmodata["panel_info"][value][ "panel_name" ] elif PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_bioinformatics_run_info_meta_table( pmodata, separator: str = "," ) -> pd.DataFrame: """ Export the bioinformatics_run_info meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the library_sample metadata """ rows = [] if "bioinformatics_run_info" not in pmodata.keys(): raise ValueError("no bioinformatics_run_info found in input PMO") run_id = 0 for bioinformatics_run_info in pmodata["bioinformatics_run_info"]: export_row = {} export_row["run_id"] = run_id run_id += 1 for key, value in bioinformatics_run_info.items(): if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_bioinformatics_methods_info_meta_table( pmodata, separator: str = "," ) -> pd.DataFrame: """ Export the bioinformatics_methods_info meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the library_sample metadata """ # @todo write pytest export_bioinformatics_methods_info_meta_table rows = [] if "bioinformatics_methods_info" not in pmodata.keys(): raise ValueError("no bioinformatics_methods_info found in input PMO") bioinformatics_methods_id = 0 for bioinformatics_methods_info in pmodata["bioinformatics_methods_info"]: bioinformatics_methods_id += 1 export_row = {} for key, value in bioinformatics_methods_info.items(): export_row["bioinformatics_methods_id"] = bioinformatics_methods_id if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) method_count = 0 for method in bioinformatics_methods_info["methods"]: method_count += 1 method_export_row = copy.deepcopy(export_row) method_export_row["method_id"] = method_count for method_key in method.keys(): method_export_row[method_key] = method[method_key] rows.append(method_export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_sequencing_info_meta_table( pmodata, separator: str = "," ) -> pd.DataFrame: """ Export the sequencing_info meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the sequencing_info metadata """ # check to see sequencing_info is loaded if "sequencing_info" not in pmodata.keys(): raise ValueError("no sequencing_info found in input PMO") rows = [] for sequencing_info in pmodata["sequencing_info"]: export_row = {} for key, value in sequencing_info.items(): if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_project_info_meta_table(pmodata, separator: str = ",") -> pd.DataFrame: """ Export the project_info meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the project_info metadata """ # check to see sequencing_info is loaded if "project_info" not in pmodata.keys(): raise ValueError("no project_info found in input PMO") rows = [] for project_info in pmodata["project_info"]: export_row = {} for key, value in project_info.items(): if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) return pd.DataFrame(rows)
[docs] @staticmethod def export_panel_info_meta_table(pmodata, separator: str = ",") -> pd.DataFrame: """ Export the panel meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the panel metadata """ rows = [] for panel_info in pmodata["panel_info"]: export_row = {} for key, value in panel_info.items(): if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) reactions_for_target = defaultdict(list) for reaction in panel_info["reactions"]: for target_id in reaction["panel_targets"]: reactions_for_target[ pmodata["target_info"][target_id]["target_name"] ].append(reaction["reaction_name"]) for target, reactions in reactions_for_target.items(): export_row_per_target = copy.deepcopy(export_row) export_row_per_target["target_name"] = target export_row_per_target["reaction_name"] = separator.join(reactions) rows.append(export_row_per_target) return pd.DataFrame(rows)
[docs] @staticmethod def export_target_info_meta_table(pmodata, separator: str = ",") -> pd.DataFrame: """ Export the target meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the panel metadata """ rows = [] for target_info in pmodata["target_info"]: export_row = {} for key, value in target_info.items(): if "forward_primer" == key: export_row["forward_primer_seq"] = target_info["forward_primer"][ "seq" ] if "location" in target_info["forward_primer"]: for primer_loc_key in target_info["forward_primer"][ "location" ].keys(): export_row[ "forward_primer_" + primer_loc_key ] = target_info["forward_primer"]["location"][ primer_loc_key ] elif "reverse_primer" == key: export_row["reverse_primer_seq"] = target_info["reverse_primer"][ "seq" ] if "location" in target_info["reverse_primer"]: for primer_loc_key in target_info["reverse_primer"][ "location" ].keys(): export_row[ "reverse_primer_" + primer_loc_key ] = target_info["reverse_primer"]["location"][ primer_loc_key ] elif "insert_location" == key: for insert_key in value.keys(): export_row["insert_" + insert_key] = value[insert_key] elif PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) df = pd.DataFrame(rows) priority_cols = ["target_name", "forward_primer_seq", "reverse_primer_seq"] leading = [c for c in priority_cols if c in df.columns] rest = sorted(c for c in df.columns if c not in priority_cols) return df[leading + rest]
[docs] @staticmethod def export_pmo_header_table(pmodata, separator: str = ",") -> pd.DataFrame: """ Export the pmo header meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the genomes metadata """ rows = [] if "pmo_header" not in pmodata.keys(): raise ValueError("no pmo_header found in input PMO") export_row = {} for key, value in pmodata["pmo_header"].items(): if "generation_method" == key: export_row["generation_method.program_version"] = value[ "program_version" ] export_row["generation_method.program_name"] = value["program_name"] elif PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) df = pd.DataFrame(rows) priority_cols = ["pmo_version"] leading = [c for c in priority_cols if c in df.columns] rest = sorted(c for c in df.columns if c not in priority_cols) return df[leading + rest]
[docs] @staticmethod def export_targeted_genomes_meta_table( pmodata, separator: str = "," ) -> pd.DataFrame: """ Export the targeted genomes meta information of a PMO to a dataframe :param pmodata: the pmo export the information from :param separator: the separator to use for list values :return: a pandas dataframe of the genomes metadata """ rows = [] genome_id = 0 if "targeted_genomes" not in pmodata.keys(): raise ValueError("no targeted_genomes found in input PMO") for genome_info in pmodata["targeted_genomes"]: export_row = {} export_row["genome_id"] = genome_id genome_id += 1 for key, value in genome_info.items(): if PMOExporter._is_primitive(value): export_row[key] = value elif PMOExporter._is_primitive_list(value): export_row[key] = separator.join(str(v) for v in value) rows.append(export_row) df = pd.DataFrame(rows) priority_cols = ["name", "genome_version", "taxon_id", "genome_id", "url"] leading = [c for c in priority_cols if c in df.columns] rest = sorted(c for c in df.columns if c not in priority_cols) return df[leading + rest]
[docs] @staticmethod def write_bed_locs(bed_locs: list[BedLoc], fnp, add_header: bool = False): """ Write out a list of BedLoc to a file, will auto overwrite it :param bed_locs: a list of BedLoc :param fnp: output file path, will be overwritten if it exists :param add_header: add header of #chrom,start end,name,score,strand,ref_seq,extra_info, starts with comment so tools will treat it as a comment line """ with open(fnp, "w") as f: if add_header: f.write( "\t".join( [ "#chrom", "start", "end", "name", "score", "strand", "ref_seq", "extra_info", ] ) ) for bed_loc in bed_locs: f.write( "\t".join( [ bed_loc.chrom, str(bed_loc.start), str(bed_loc.end), bed_loc.name, str(bed_loc.score), bed_loc.strand, str(bed_loc.ref_seq), bed_loc.extra_info, ] ) ) f.write("\n")
[docs] @staticmethod def extract_targets_insert_bed_loc( pmodata, select_target_ids: list[int] = None, sort_output: bool = True ): """ Extract out of a PMO the insert location for targets, will add ref seq if loaded into PMO :param pmodata: the PMO to extract from :param select_target_ids: a list of target ids to select, if None will select all targets :param sort_output: whether to sort output by genomic location :return: a list of target inserts, with named tuples with fields: chrom, start, end, name, score, strand, ref_seq, extra_info """ bed_loc_out = [] if select_target_ids is None: select_target_ids = list(range(len(pmodata["target_info"]))) for target_id in select_target_ids: tar = pmodata["target_info"][target_id] if "insert_location" not in tar: raise Exception( "no insert_location in pmodata for target id " + str(target_id) + " target_name " + str(tar["target_name"]) + ", cannot extract insert_location" ) genome_info = pmodata["targeted_genomes"][ tar["insert_location"]["genome_id"] ] genome_name_version = ( genome_info["name"] + "_" + genome_info["genome_version"] ) extra_info = ( str("[") + str("genome_name_version=") + genome_name_version + ";]" ) strand = ( "+" if "strand" not in tar["insert_location"] else tar["insert_location"]["strand"] ) ref_seq = ( "" if "ref_seq" not in tar["insert_location"] else tar["insert_location"]["ref_seq"] ) bed_loc_out.append( BedLoc( tar["insert_location"]["chrom"], tar["insert_location"]["start"], tar["insert_location"]["end"], tar["target_name"], tar["insert_location"]["end"] - tar["insert_location"]["start"], strand, ref_seq, extra_info, ) ) if sort_output: return sorted(bed_loc_out, key=lambda bed: (bed.chrom, bed.start, bed.end)) return bed_loc_out
[docs] @staticmethod def extract_panels_insert_bed_loc( pmodata, select_panel_ids: list[int] = None, sort_output: bool = True ): """ Extract out of a PMO the insert location for panels, will add ref seq if loaded into PMO :param pmodata: the PMO to extract from :param select_panel_ids: a list of panels ids to select, if None will select all panels :param sort_output: whether to sort output by genomic location :return: a list of target inserts, with named tuples with fields: chrom, start, end, name, score, strand, ref_seq, extra_info """ bed_loc_out = {} if select_panel_ids is None: select_panel_ids = list(range(len(pmodata["panel_info"]))) for panel_id in select_panel_ids: bed_loc_out_per_panel = [] for reaction_id in range(len(pmodata["panel_info"][panel_id]["reactions"])): for target_id in pmodata["panel_info"][panel_id]["reactions"][ reaction_id ]["panel_targets"]: tar = pmodata["target_info"][target_id] if "insert_location" not in tar: raise Exception( "no insert_location in pmodata for target id " + str(target_id) + " target_name " + str(tar["target_name"]) + ", cannot extract insert_location" ) genome_info = pmodata["targeted_genomes"][ tar["insert_location"]["genome_id"] ] genome_name_version = ( genome_info["name"] + "_" + genome_info["genome_version"] ) extra_info = ( str("[") + "genome_name_version=" + genome_name_version + ";" + "panel=" + pmodata["panel_info"][panel_id]["panel_name"] + ";" + "reaction=" + pmodata["panel_info"][panel_id]["reactions"][reaction_id][ "reaction_name" ] + ";" + "]" ) strand = ( "+" if "strand" not in tar["insert_location"] else tar["insert_location"]["strand"] ) ref_seq = ( "" if "ref_seq" not in tar["insert_location"] else tar["insert_location"]["ref_seq"] ) bed_loc_out_per_panel.append( BedLoc( tar["insert_location"]["chrom"], tar["insert_location"]["start"], tar["insert_location"]["end"], tar["target_name"], tar["insert_location"]["end"] - tar["insert_location"]["start"], strand, ref_seq, extra_info, ) ) if sort_output: return sorted( bed_loc_out_per_panel, key=lambda bed: (bed.chrom, bed.start, bed.end), ) bed_loc_out[panel_id] = bed_loc_out_per_panel return bed_loc_out
[docs] @staticmethod def extract_alleles_per_sample_table( pmodata, additional_specimen_info_fields: list[str] = None, additional_library_sample_info_fields: list[str] = None, additional_microhap_fields: list[str] = None, additional_representative_info_fields: list[str] = None, default_base_col_names: list[str] = [ "library_sample_name", "target_name", "seq", ], jsonschema_fnp=os.path.join( os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ), "schemas/", f"portable_microhaplotype_object_v{__pmotools_version__}.schema.json", ), validate_pmo: bool = False, ) -> pd.DataFrame: """ Create a pd.Dataframe of sample, target and allele. Can optionally add on any other additional fields :param pmodata: the data to write from :param additional_specimen_info_fields: any additional fields to write from the specimen_info object :param additional_library_sample_info_fields: any additional fields to write from the library_samples object :param additional_microhap_fields: any additional fields to write from the microhap object :param additional_representative_info_fields: any additional fields to write from the representative_microhaplotype_sequences object :param default_base_col_names: The default column name for the library_sample_name, target_name and seq :param jsonschema_fnp: path to the jsonschema schema file to validate the PMO against :param validate_pmo: whether to validate the PMO with a jsonschema :return: pandas dataframe """ # check input if validate_pmo: with open(jsonschema_fnp) as f: checker = PMOChecker(json.load(f)) checker.validate_pmo_json(pmodata) # Check to see if at least 1 sample has supplied meta field # samples without this meta field will have NA if additional_specimen_info_fields is not None: # Find meta fields that have at least some data meta_fields_with_data = { metafield for metafield in additional_specimen_info_fields for specimen_data in pmodata["specimen_info"] if metafield in specimen_data } # Determine meta fields with no samples having data meta_fields_with_no_samples = ( set(additional_specimen_info_fields) - meta_fields_with_data ) if meta_fields_with_no_samples: raise Exception( f"No specimen_info have data for fields: {', '.join(meta_fields_with_no_samples)}" ) # Check to see if at least 1 sample has supplied meta field # samples without this meta field will have NA if additional_library_sample_info_fields is not None: # Find meta fields that have at least some data meta_fields_with_data = { metafield for metafield in additional_library_sample_info_fields for library_data in pmodata["library_sample_info"] if metafield in library_data } # Determine meta fields with no samples having data meta_fields_with_no_samples = ( set(additional_library_sample_info_fields) - meta_fields_with_data ) if meta_fields_with_no_samples: raise Exception( f"No library_sample_info have data for fields: {', '.join(meta_fields_with_no_samples)}" ) # Check to see if at least 1 haplotype has this field # samples without this meta field will have NA if additional_microhap_fields is not None: # Find meta fields that have at least some data additional_microhap_fields_with_data = { additional_microhap_field for additional_microhap_field in additional_microhap_fields for detected_microhaplotypes in pmodata["detected_microhaplotypes"] for library_samples_data in detected_microhaplotypes["library_samples"] for target_data in library_samples_data["target_results"] for microhap_data in target_data["mhaps"] if additional_microhap_field in microhap_data } # Determine meta fields with no samples having data additional_microhap_fields_with_no_samples = ( set(additional_microhap_fields) - additional_microhap_fields_with_data ) if additional_microhap_fields_with_no_samples: raise Exception( f"No detected_microhaplotypes have data for fields: {', '.join(additional_microhap_fields_with_no_samples)}" ) # Check to see if at least 1 haplotype has this field # samples without this meta field will have NA if additional_representative_info_fields is not None: # Find meta fields that have at least some data # not add seq as it's being added by default, so don't output twice additional_microhap_fields_with_data = { additional_microhap_field for additional_microhap_field in additional_representative_info_fields for target_data in pmodata["representative_microhaplotypes"]["targets"] for microhap_data in target_data["microhaplotypes"] if additional_microhap_field in microhap_data and additional_microhap_field != "seq" } # Determine meta fields with no samples having data additional_microhap_fields_with_no_samples = ( set(additional_representative_info_fields) - additional_microhap_fields_with_data ) if additional_microhap_fields_with_no_samples: raise Exception( f"No representative_microhaplotype_sequences have data for fields: {', '.join(additional_microhap_fields_with_no_samples)}" ) if len(default_base_col_names) != 3: raise Exception( "Must have 3 default columns for allele counts, not {}".format( len(default_base_col_names) ) ) rows = [] specimen_info = pmodata["specimen_info"] target_info = pmodata["target_info"] library_sample_info = pmodata["library_sample_info"] detected_microhaps = pmodata["detected_microhaplotypes"] rep_haps = pmodata["representative_microhaplotypes"]["targets"] bioinformatics_run_names = None if "bioinformatics_run_info" in pmodata: bioinformatics_run_names = PMOProcessor.get_bioinformatics_run_names( pmodata ) detected_microhaplotypes_count = 0 for bio_run_for_detected_microhaps in detected_microhaps: bioinformatics_run_id = None if "bioinformatics_run_id" in bio_run_for_detected_microhaps: bioinformatics_run_id = bio_run_for_detected_microhaps[ "bioinformatics_run_id" ] for sample_data in bio_run_for_detected_microhaps["library_samples"]: library_sample_id = sample_data["library_sample_id"] specimen_id = library_sample_info[library_sample_id]["specimen_id"] library_meta = library_sample_info[library_sample_id] specimen_meta = specimen_info[specimen_id] for target_data in sample_data["target_results"]: target_name = target_info[ rep_haps[target_data["mhaps_target_id"]]["target_id"] ]["target_name"] for microhap_data in target_data["mhaps"]: allele_id = microhap_data["mhap_id"] # print(rep_haps[target_data["mhaps_target_id"]]) rep_hap_meta = rep_haps[target_data["mhaps_target_id"]][ "microhaplotypes" ][allele_id] row = { default_base_col_names[0]: library_meta[ "library_sample_name" ], default_base_col_names[1]: target_name, default_base_col_names[2]: rep_hap_meta["seq"], } if ( bioinformatics_run_names is not None and bioinformatics_run_id is not None ): row["bioinformatics_run_name"] = bioinformatics_run_names[ bioinformatics_run_id ] else: row[ "bioinformatics_run_name" ] = f"detected_microhaplotypes_count_idx_{detected_microhaplotypes_count}" if additional_library_sample_info_fields is not None: for field in additional_library_sample_info_fields: row[field] = library_meta.get(field, "NA") if additional_specimen_info_fields is not None: for field in additional_specimen_info_fields: row[field] = specimen_meta.get(field, "NA") if additional_microhap_fields is not None: for field in additional_microhap_fields: row[field] = microhap_data.get(field, "NA") if additional_representative_info_fields is not None: for field in additional_representative_info_fields: row[field] = rep_hap_meta.get(field, "NA") rows.append(row) detected_microhaplotypes_count += 1 # Build and return DataFrame return pd.DataFrame(rows)
[docs] @staticmethod def list_library_sample_names_per_specimen_name( pmodata, select_specimen_ids: list[int] = None, select_specimen_names: list[str] = None, ) -> pd.DataFrame: """ List all the library_sample_names per specimen_name :param pmodata: the PMO :param select_specimen_ids: a list of specimen_ids to select, if None, all specimen_ids are used :param select_specimen_names: a list of specimen_names to select, if None, all specimen_names are used :return: a pandas dataframe with 3 columns, specimen_id, library_sample_id, and library_sample_id_count(the number of library_sample_ids per specimen_id) """ if select_specimen_ids is not None and select_specimen_names is not None: raise ValueError( "Cannot specify both select_specimen_ids and select_specimen_names" ) lib_samples_per_spec = defaultdict(list[str]) if select_specimen_names is not None: select_specimen_ids = PMOProcessor.get_index_of_specimen_names( pmodata, select_specimen_names ) for lib_sample in pmodata["library_sample_info"]: if ( select_specimen_ids is None or lib_sample["specimen_id"] in select_specimen_ids ): lib_samples_per_spec[ pmodata["specimen_info"][lib_sample["specimen_id"]]["specimen_name"] ].append(lib_sample["library_sample_name"]) specimens_not_list = [] for specimen in pmodata["specimen_info"]: if specimen["specimen_name"] not in lib_samples_per_spec: specimens_not_list.append(specimen["specimen_name"]) # Prepare the data for DataFrame creation data = [] for specimen_name, library_sample_names in lib_samples_per_spec.items(): for library_sample_name in library_sample_names: data.append( { "specimen_name": specimen_name, "library_sample_name": library_sample_name, "library_sample_count": len(library_sample_names), } ) # Create the DataFrame df = pd.DataFrame( data, columns=["specimen_name", "library_sample_name", "library_sample_count"], ) return df
@staticmethod def _write_sheet(writer: pd.ExcelWriter, config: "PMOExporter.SheetConfig") -> None: """Write a single DataFrame to an Excel sheet and autofit its columns.""" config.df.to_excel(writer, sheet_name=config.sheet_name, index=False) PMOExporter._autofit_columns( writer, config.sheet_name, config.df, specific_cols=config.specific_cols, max_row_check=config.max_row_check, ) @staticmethod def _autofit_columns( writer: pd.ExcelWriter, sheet_name: str, df: pd.DataFrame, specific_cols: list[str] | None = None, max_row_check: int | None = None, ) -> None: """ Auto-adjusts column widths in an Excel worksheet based on content length. Args: writer: The active ExcelWriter instance (post df.to_excel call). sheet_name: The name of the worksheet to adjust. df: The DataFrame that was written to the sheet. specific_cols: Optional list of column names to adjust. If None, all columns are adjusted. max_row_check: Optional max number of rows to sample when calculating width. Always includes the header regardless of this value. If None, all rows are checked. """ worksheet = writer.sheets[sheet_name] columns = specific_cols if specific_cols is not None else list(df.columns) for column in columns: col_idx = df.columns.get_loc(column) + 1 sample = ( df[column] if max_row_check is None else df[column].iloc[:max_row_check] ) max_length = max( sample.astype(str).map(len).max() if len(sample) > 0 else 0, len(str(column)), ) col_letter = get_column_letter(col_idx) worksheet.column_dimensions[col_letter].width = max_length + 1
[docs] @dataclass class SheetConfig: """Configuration for writing a DataFrame to an Excel sheet.""" sheet_name: str df: pd.DataFrame max_row_check: int | None = None specific_cols: list[str] | None = None
@staticmethod def _build_pmo_sheet_configs(pmo) -> "list[PMOExporter.SheetConfig]": """ Build the ordered list of SheetConfigs to export from a PMO object. Optional sheets are included only if their key is present in pmo. """ sheet_conf = PMOExporter.SheetConfig sheets = [ sheet_conf("PMO Header", PMOExporter.export_pmo_header_table(pmo)), sheet_conf( "Required Panel Targets", PMOExporter.export_target_info_meta_table(pmo) ), sheet_conf( "Required Panel Info", PMOExporter.export_panel_info_meta_table(pmo) ), ] if "targeted_genomes" in pmo: sheets.append( sheet_conf( "Optional GenomeInfo", PMOExporter.export_targeted_genomes_meta_table(pmo), ) ) sheets.append( sheet_conf( "Required Microhaplotype", # @todo add in the optional fields of the detected_microhaplotypes and representative_microhaplotypes PMOExporter.extract_alleles_per_sample_table( pmo, additional_microhap_fields=["reads"] ), max_row_check=10, ) ) sheets.append( sheet_conf( "Optional Specimen Level", PMOExporter.export_specimen_meta_table(pmo), max_row_check=10, ) ) sheets.append( sheet_conf( "Optional LibrarySampleInfo", PMOExporter.export_library_sample_meta_table(pmo), max_row_check=10, ) ) if "project_info" in pmo: sheets.append( sheet_conf( "Optional ProjectInfo", PMOExporter.export_project_info_meta_table(pmo), ) ) if "sequencing_info" in pmo: sheets.append( sheet_conf( "Optional SequencingInfo", PMOExporter.export_sequencing_info_meta_table(pmo), ) ) if "bioinformatics_methods_info" in pmo: sheets.append( sheet_conf( "Optional Bioinformatics Methods", PMOExporter.export_bioinformatics_methods_info_meta_table(pmo), ) ) if "bioinformatics_run_info" in pmo: sheets.append( sheet_conf( "Optional Bioinformatics Run", PMOExporter.export_bioinformatics_run_info_meta_table(pmo), ) ) return sheets
[docs] @staticmethod def export_to_excel(pmo, output_path: str) -> None: """ Export a PMO object to a multi-sheet Excel file. Args: pmo: The PMO object to export. output_path: The path to write the Excel file to. """ with pd.ExcelWriter(output_path, engine="openpyxl") as writer: for config in PMOExporter._build_pmo_sheet_configs(pmo): PMOExporter._write_sheet(writer, config)