#!/usr/bin/env python3
import pandas as pd
import json
from .json_convert_utils import remove_optional_null_values, check_null_values
def pandas_table_to_json(contents: pd.DataFrame, return_indexed_dict: bool = False):
    """
    Serialise a pandas table into plain JSON-compatible Python objects.

    The frame is written out with ``DataFrame.to_json`` (ISO date format) and parsed
    back with ``json.loads``; every ``None`` found inside a parsed JSON object is
    replaced by an empty string.

    :param contents: the dataframe to be converted
    :param return_indexed_dict: when True, serialise with ``orient="index"`` (a dictionary keyed by the index); otherwise with ``orient="records"`` (one dictionary per row)
    :return: the parsed table data
    """

    def _blank_nulls(parsed_object):
        # json.loads invokes this hook for every decoded JSON object.
        cleaned = {}
        for key, value in parsed_object.items():
            cleaned[key] = "" if value is None else value
        return cleaned

    if return_indexed_dict:
        serialized = contents.to_json(orient="index", index=True, date_format="iso")
    else:
        serialized = contents.to_json(orient="records", date_format="iso")
    return json.loads(serialized, object_hook=_blank_nulls)
def library_sample_info_table_to_pmo(
    contents: pd.DataFrame,
    library_sample_name_col: str = "library_sample_name",
    specimen_name_col: str = "specimen_name",
    panel_name_col: str = "panel_name",
    sequencing_info_name_col: str | None = None,
    alternate_identifiers_col: str | None = None,
    experiment_accession_col: str | None = None,
    fastqs_loc_col: str | None = None,
    library_prep_plate_name_col: str | None = None,
    library_prep_plate_col_col: str | None = None,
    library_prep_plate_row_col: str | None = None,
    library_prep_plate_position_col: str | None = None,
    parasite_density_col: str | list | None = None,
    parasite_density_method_col: str | list | None = None,
    run_accession_col: str | None = None,
    additional_library_sample_info_cols: list | None = None,
    list_values_library_values: list | None = ["alternate_identifiers"],
    list_values_library_values_delimiter: str = ",",
):
    """
    Convert a DataFrame containing library information into JSON.

    :param contents: input DataFrame containing library data
    :type contents: pd.DataFrame
    :param library_sample_name_col: column name for library sample names. Default: library_sample_name
    :type library_sample_name_col: str
    :param specimen_name_col: column name for specimen names. Default: specimen_name
    :type specimen_name_col: str
    :param panel_name_col: column name for panel names. Default: panel_name
    :type panel_name_col: str
    :param sequencing_info_name_col: column name for sequencing information names
    :type sequencing_info_name_col: str, optional
    :param alternate_identifiers_col: column name for alternate identifiers
    :type alternate_identifiers_col: str, optional
    :param experiment_accession_col: column name for experiment accession information
    :type experiment_accession_col: str, optional
    :param fastqs_loc_col: column name for location of fastqs
    :type fastqs_loc_col: str, optional
    :param library_prep_plate_name_col: column name containing plate name for sequencing
    :type library_prep_plate_name_col: str, optional
    :param library_prep_plate_col_col: column name for the column of the sample on the sequencing plate
    :type library_prep_plate_col_col: str, optional
    :param library_prep_plate_row_col: column name for the row of the sample on the sequencing plate
    :type library_prep_plate_row_col: str, optional
    :param library_prep_plate_position_col: column name for position on the sequencing plate (e.g. A01). Can't be set if library_prep_plate_col_col and library_prep_plate_row_col are specified.
    :type library_prep_plate_position_col: str, optional
    :param parasite_density_col: the parasite density in parasites per microliter
    :type parasite_density_col: str or list of str, optional
    :param parasite_density_method_col: the method of how the density was obtained. If set, parasite_density_col must also be specified.
    :type parasite_density_method_col: str or list of str, optional
    :param run_accession_col: column name for run accession information
    :type run_accession_col: str, optional
    :param additional_library_sample_info_cols: additional column names to include
    :type additional_library_sample_info_cols: list of str, optional
    :param list_values_library_values: output fields whose values could be a list, delimited by list_values_library_values_delimiter. None is treated as "no such fields".
    :type list_values_library_values: list of str, optional
    :param list_values_library_values_delimiter: delimiter between list_values_library_values. Default: ','
    :type list_values_library_values_delimiter: str
    :return: a list of dictionaries, one per input row, holding the selected fields under their PMO names
    :rtype: list
    :raises ValueError: if contents is not a DataFrame, if a column is selected more than once, if a selected column is missing from contents, or if a list-valued field holds an unsupported type
    """
    # Check contents is a dataframe
    if not isinstance(contents, pd.DataFrame):
        raise ValueError("contents must be a pandas DataFrame.")
    copy_contents = contents.copy()

    column_mapping = {library_sample_name_col: "library_sample_name"}
    required_columns = list(column_mapping)
    recommended_columns = [specimen_name_col, panel_name_col, sequencing_info_name_col]

    # Add optional columns (unset ones all collapse onto the None key and are dropped)
    optional_column_mapping = {
        specimen_name_col: "specimen_name",
        panel_name_col: "panel_name",
        sequencing_info_name_col: "sequencing_info_name",
        alternate_identifiers_col: "alternate_identifiers",
        experiment_accession_col: "experiment_accession",
        fastqs_loc_col: "fastqs_loc",
        run_accession_col: "run_accession",
    }
    column_mapping.update(
        {k: v for k, v in optional_column_mapping.items() if k is not None}
    )

    # Include additional user-defined columns if provided
    if additional_library_sample_info_cols:
        for col in additional_library_sample_info_cols:
            column_mapping[col] = col

    # Checks on columns selected. run_accession and the plate columns are included
    # so that one source column cannot silently feed two output fields.
    check_unique_columns(
        [
            library_sample_name_col,
            sequencing_info_name_col,
            specimen_name_col,
            panel_name_col,
            alternate_identifiers_col,
            experiment_accession_col,
            fastqs_loc_col,
            run_accession_col,
            library_prep_plate_name_col,
            library_prep_plate_col_col,
            library_prep_plate_row_col,
            library_prep_plate_position_col,
        ]
    )
    check_columns_exist(copy_contents, list(column_mapping))

    # Check for null values in required columns and the recommended columns that
    # were actually selected (built as a new list, in a deterministic order).
    columns_to_check = required_columns + [
        col for col in recommended_columns if col in column_mapping
    ]
    check_null_values(copy_contents, columns_to_check)

    # Rename and subset columns
    selected_pmo_fields = list(column_mapping.values())
    copy_contents = copy_contents.rename(columns=column_mapping)
    subset_contents = copy_contents[selected_pmo_fields]

    # Convert to format
    meta_json = pandas_table_to_json(subset_contents)

    # Rows are matched back to the table on the library sample name: it is the
    # only field guaranteed to be selected, and several libraries may share a
    # specimen name.
    meta_json = add_plate_info(
        library_prep_plate_col_col,
        library_prep_plate_name_col,
        library_prep_plate_row_col,
        library_prep_plate_position_col,
        meta_json,
        copy_contents,
        "library_sample_name",
        entry_name="library_prep_plate_info",
    )
    meta_json = add_parasite_density_info(
        parasite_density_col,
        parasite_density_method_col,
        meta_json,
        copy_contents,
        "library_sample_name",
        entry_name="parasite_density_info",
    )

    # listify fields that contain values that could be list, are delimited by the
    # argument list_values_library_values_delimiter
    primitives = (int, float, str, bool, complex)
    for col in list_values_library_values or []:
        # Only fields that were selected exist in the records; the frame itself
        # may carry an unselected column with the same name.
        if col not in selected_pmo_fields:
            continue
        for lib in meta_json:
            value = lib[col]
            if isinstance(value, str):
                # NOTE(review): a null cell arrives here as "" and becomes [""];
                # confirm remove_optional_null_values treats that as null.
                lib[col] = value.split(list_values_library_values_delimiter)
            elif isinstance(value, list):
                continue
            elif isinstance(value, primitives):
                lib[col] = [value]
            else:
                raise ValueError(
                    f"Column '{col}' must contain either strings or lists of strings."
                )

    meta_json = remove_optional_null_values(
        meta_json, list(optional_column_mapping.values())
    )
    return meta_json
def specimen_info_table_to_pmo(
    contents: pd.DataFrame,
    specimen_name_col: str = "specimen_name",
    specimen_taxon_id_col: str | None = None,
    host_taxon_id_col: str | None = None,
    collection_date_col: str | None = None,
    collection_country_col: str | None = None,
    project_name_col: str | None = None,
    alternate_identifiers_col: str | None = None,
    blood_meal_col: str | None = None,
    drug_usage_col: str | None = None,
    env_broad_scale_col: str | None = None,
    env_local_scale_col: str | None = None,
    env_medium_col: str | None = None,
    geo_admin1_col: str | None = None,
    geo_admin2_col: str | None = None,
    geo_admin3_col: str | None = None,
    gravid_col: str | None = None,
    gravidity_col: str | None = None,
    has_travel_out_six_month_col: str | None = None,
    host_age_col: str | None = None,
    host_sex_col: str | None = None,
    host_subject_id: str | None = None,
    lat_lon_col: str | None = None,
    parasite_density_col: str | list | None = None,
    parasite_density_method_col: str | list | None = None,
    specimen_accession_col: str | None = None,
    storage_plate_col_col: str | None = None,
    storage_plate_name_col: str | None = None,
    storage_plate_row_col: str | None = None,
    storage_plate_position_col: str | None = None,
    specimen_collect_device_col: str | None = None,
    specimen_comments_col: str | None = None,
    specimen_store_loc_col: str | None = None,
    specimen_type_col: str | None = None,
    treatment_status_col: str | None = None,
    additional_specimen_cols: list | None = None,
    list_values_specimen_values: list | None = [
        "alternate_identifiers",
        "drug_usage",
        "specimen_comments",
        "treatment_status",
        "specimen_taxon_id",
    ],
    list_values_specimen_values_delimiter: str = ",",
):
    """
    Convert a DataFrame containing specimen information into JSON.

    :param contents: the input DataFrame containing specimen data
    :type contents: pd.DataFrame
    :param specimen_name_col: the column name for specimen names. Default: specimen_name
    :type specimen_name_col: str
    :param specimen_taxon_id_col: NCBI taxonomy number of the organism
    :type specimen_taxon_id_col: str, optional
    :param host_taxon_id_col: NCBI taxonomy number of the host
    :type host_taxon_id_col: str, optional
    :param collection_date_col: date of the sample collection
    :type collection_date_col: str, optional
    :param collection_country_col: name of country collected in (admin level 0)
    :type collection_country_col: str, optional
    :param project_name_col: name of the project
    :type project_name_col: str, optional
    :param alternate_identifiers_col: list of optional alternative names for the samples
    :type alternate_identifiers_col: str, optional
    :param blood_meal_col: whether the host specimen has had a recent blood meal
    :type blood_meal_col: str, optional
    :param drug_usage_col: any drug used by the subject and the frequency of usage; can include multiple drugs used
    :type drug_usage_col: str, optional
    :param env_broad_scale_col: the broad environment from which the specimen was collected
    :type env_broad_scale_col: str, optional
    :param env_local_scale_col: the local environment from which the specimen was collected
    :type env_local_scale_col: str, optional
    :param env_medium_col: the environment medium from which the specimen was collected
    :type env_medium_col: str, optional
    :param geo_admin1_col: geographical admin level 1
    :type geo_admin1_col: str, optional
    :param geo_admin2_col: geographical admin level 2
    :type geo_admin2_col: str, optional
    :param geo_admin3_col: geographical admin level 3
    :type geo_admin3_col: str, optional
    :param gravid_col: whether the host specimen is pregnant
    :type gravid_col: str, optional
    :param gravidity_col: the number of previous pregnancies
    :type gravidity_col: str, optional
    :param has_travel_out_six_month_col: whether the host specimen has travelled out from the local region in the last six months
    :type has_travel_out_six_month_col: str, optional
    :param host_age_col: the age in years of the person
    :type host_age_col: str, optional
    :param host_sex_col: if the specimen is from a person, the sex of that person
    :type host_sex_col: str, optional
    :param host_subject_id: ID for the individual a specimen was collected from
    :type host_subject_id: str, optional
    :param lat_lon_col: latitude and longitude of the collection site
    :type lat_lon_col: str, optional
    :param parasite_density_col: the parasite density in parasites per microliter
    :type parasite_density_col: str or list of str, optional
    :param parasite_density_method_col: the method of how the density was obtained. If set, parasite_density_col must also be specified.
    :type parasite_density_method_col: str or list of str, optional
    :param specimen_accession_col: the accession number of the specimen
    :type specimen_accession_col: str, optional
    :param storage_plate_col_col: column the specimen was in on the plate. If set, storage_plate_row_col must also be specified.
    :type storage_plate_col_col: str, optional
    :param storage_plate_name_col: name of the plate the specimen was in
    :type storage_plate_name_col: str, optional
    :param storage_plate_row_col: row the specimen was in on the plate. If set, storage_plate_col_col must also be specified.
    :type storage_plate_row_col: str, optional
    :param storage_plate_position_col: position of the specimen on the plate (e.g. A01). Can't be set if storage_plate_col_col and storage_plate_row_col are specified.
    :type storage_plate_position_col: str, optional
    :param specimen_collect_device_col: the way the specimen was collected
    :type specimen_collect_device_col: str, optional
    :param specimen_comments_col: additional comments about the specimen
    :type specimen_comments_col: str, optional
    :param specimen_store_loc_col: specimen storage site
    :type specimen_store_loc_col: str, optional
    :param specimen_type_col: type of specimen, e.g. negative_control, positive_control, field_sample
    :type specimen_type_col: str, optional
    :param treatment_status_col: if the person has been treated with drugs, what the treatment outcome was
    :type treatment_status_col: str, optional
    :param additional_specimen_cols: additional column names to include
    :type additional_specimen_cols: list of str, optional
    :param list_values_specimen_values: output fields whose values could be a list, delimited by list_values_specimen_values_delimiter. None is treated as "no such fields".
    :type list_values_specimen_values: list of str, optional
    :param list_values_specimen_values_delimiter: delimiter between list_values_specimen_values. Default: ','
    :type list_values_specimen_values_delimiter: str
    :return: a list of dictionaries, one per input row, holding the selected fields under their PMO names
    :rtype: list
    :raises ValueError: if contents is not a DataFrame, if a column is selected more than once, if a selected column is missing from contents, or if a list-valued field holds an unsupported type
    """
    # Check contents is a dataframe
    if not isinstance(contents, pd.DataFrame):
        raise ValueError("contents must be a pandas DataFrame.")
    copy_contents = contents.copy()

    column_mapping = {specimen_name_col: "specimen_name"}
    required_columns = list(column_mapping)
    recommended_columns = [
        specimen_taxon_id_col,
        host_taxon_id_col,
        collection_date_col,
        collection_country_col,
        project_name_col,
    ]

    # Optional columns (unset ones all collapse onto the None key and are dropped)
    optional_column_mapping = {
        specimen_taxon_id_col: "specimen_taxon_id",
        host_taxon_id_col: "host_taxon_id",
        collection_date_col: "collection_date",
        collection_country_col: "collection_country",
        project_name_col: "project_name",
        alternate_identifiers_col: "alternate_identifiers",
        drug_usage_col: "drug_usage",
        blood_meal_col: "blood_meal",
        gravid_col: "gravid",
        gravidity_col: "gravidity",
        has_travel_out_six_month_col: "has_travel_out_six_month",
        env_broad_scale_col: "env_broad_scale",
        env_local_scale_col: "env_local_scale",
        env_medium_col: "env_medium",
        geo_admin1_col: "geo_admin1",
        geo_admin2_col: "geo_admin2",
        geo_admin3_col: "geo_admin3",
        host_age_col: "host_age",
        host_sex_col: "host_sex",
        host_subject_id: "host_subject_id",
        lat_lon_col: "lat_lon",
        specimen_accession_col: "specimen_accession",
        specimen_type_col: "specimen_type",
        treatment_status_col: "treatment_status",
        specimen_collect_device_col: "specimen_collect_device",
        specimen_comments_col: "specimen_comments",
        specimen_store_loc_col: "specimen_store_loc",
    }
    column_mapping.update(
        {k: v for k, v in optional_column_mapping.items() if k is not None}
    )

    # Include additional user-defined columns if provided
    if additional_specimen_cols:
        for col in additional_specimen_cols:
            column_mapping[col] = col

    # Check column selection
    check_unique_columns(
        [
            specimen_name_col,
            specimen_taxon_id_col,
            host_taxon_id_col,
            collection_date_col,
            collection_country_col,
            project_name_col,
            alternate_identifiers_col,
            drug_usage_col,
            env_broad_scale_col,
            env_local_scale_col,
            env_medium_col,
            geo_admin1_col,
            geo_admin2_col,
            geo_admin3_col,
            host_age_col,
            host_sex_col,
            host_subject_id,
            lat_lon_col,
            specimen_accession_col,
            specimen_type_col,
            treatment_status_col,
            storage_plate_col_col,
            storage_plate_name_col,
            storage_plate_row_col,
            storage_plate_position_col,
            specimen_collect_device_col,
            specimen_comments_col,
            specimen_store_loc_col,
            blood_meal_col,
            gravid_col,
            gravidity_col,
            has_travel_out_six_month_col,
        ]
    )
    check_columns_exist(copy_contents, list(column_mapping))

    # Check for null values in required columns and the recommended columns that
    # were actually selected (built as a new list, in a deterministic order).
    columns_to_check = required_columns + [
        col for col in recommended_columns if col in column_mapping
    ]
    check_null_values(copy_contents, columns_to_check)

    # Rename and subset columns
    selected_pmo_fields = list(column_mapping.values())
    copy_contents = copy_contents.rename(columns=column_mapping)
    subset_contents = copy_contents[selected_pmo_fields]

    meta_json = pandas_table_to_json(subset_contents)
    meta_json = add_parasite_density_info(
        parasite_density_col,
        parasite_density_method_col,
        meta_json,
        copy_contents,
        "specimen_name",
        entry_name="parasite_density_info",
    )
    meta_json = add_plate_info(
        storage_plate_col_col,
        storage_plate_name_col,
        storage_plate_row_col,
        storage_plate_position_col,
        meta_json,
        copy_contents,
        "specimen_name",
        entry_name="storage_plate_info",
    )

    # listify fields that contain values that could be list, are delimited by the
    # argument list_values_specimen_values_delimiter
    primitives = (int, float, str, bool, complex)
    for col in list_values_specimen_values or []:
        # Only fields that were selected exist in the records; the frame itself
        # may carry an unselected column with the same name.
        if col not in selected_pmo_fields:
            continue
        for spec in meta_json:
            value = spec[col]
            if isinstance(value, str):
                # NOTE(review): a null cell arrives here as "" and becomes [""];
                # confirm remove_optional_null_values treats that as null.
                spec[col] = value.split(list_values_specimen_values_delimiter)
            elif isinstance(value, list):
                continue
            elif isinstance(value, primitives):
                spec[col] = [value]
            else:
                raise ValueError(
                    f"Column '{col}' must contain either strings or lists of strings."
                )

    meta_json = remove_optional_null_values(
        meta_json, list(optional_column_mapping.values())
    )
    return meta_json
def check_unique_columns(columns):
    """
    Ensure no column name has been selected more than once.

    ``None`` entries (optional columns that were not set) are ignored.

    :param columns: iterable of column names, possibly containing None
    :raises ValueError: if any non-None name appears more than once
    """
    chosen = list(filter(lambda name: name is not None, columns))
    distinct = set(chosen)
    if len(distinct) < len(chosen):
        raise ValueError("Selected columns must be unique.")
def check_columns_exist(df, columns):
    """
    Verify that every requested column is present in the DataFrame.

    :param df: the DataFrame whose ``columns`` are inspected
    :param columns: iterable of column names that must be present
    :raises ValueError: listing, in request order, every name not found in ``df.columns``
    """
    available = df.columns
    absent = [name for name in columns if name not in available]
    if not absent:
        return
    raise ValueError(f"The following columns are not in the DataFrame: {absent}")
def _plate_cell(frame, column):
    """Return the first value of ``column`` in ``frame``, or None when the column is unset or the value is null."""
    if not column:
        return None
    value = frame[column].iloc[0]
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return None
    return value


def add_plate_info(
    plate_col_col,
    plate_name_col,
    plate_row_col,
    plate_position_col,
    meta_json,
    df,
    specimen_name_col,
    entry_name="plate_info",
):
    """
    Attach plate name/row/column information to each record of ``meta_json``.

    The plate location is given either as separate row and column columns or as a
    single position column (e.g. ``A01``), which is split into an upper-case row
    letter and an integer column. Each record is matched to the first row of
    ``df`` whose ``specimen_name_col`` value equals the record's value under the
    same key. Null plate values are skipped, and records with no plate values at
    all are left untouched. ``df`` itself is not modified.

    :param plate_col_col: name of the column in ``df`` holding the plate column, or None
    :param plate_name_col: name of the column in ``df`` holding the plate name, or None
    :param plate_row_col: name of the column in ``df`` holding the plate row, or None
    :param plate_position_col: name of the column in ``df`` holding a combined position such as ``A01``, or None
    :param meta_json: list of record dictionaries; updated in place and returned
    :param df: DataFrame holding the plate columns and the key column
    :param specimen_name_col: key present both as a column of ``df`` and in every record
    :param entry_name: key under which the plate dictionary is stored in each record
    :return: ``meta_json``
    :raises ValueError: if only one of row/column is set, if position is combined with row/column, or if a position value is not a letter A-H followed by a number 1-12
    """
    if all(
        col is None
        for col in (plate_col_col, plate_name_col, plate_row_col, plate_position_col)
    ):
        return meta_json

    # If one of col or row are set both must be
    if (plate_row_col is None) != (plate_col_col is None):
        raise ValueError("If either plate row or column is set, then both must be.")

    # Check position isn't specified in multiple ways
    if plate_position_col:
        if plate_col_col:
            raise ValueError(
                "Plate position can be specified using either row and col, or position, but not both."
            )
        position_error = f"Values in '{plate_position_col}' must start with a single letter A-H/a-h followed by number 1-12."
        try:
            # Group 0 is the row letter, group 1 the column number without
            # leading zeros.
            parsed = df[plate_position_col].str.extract(
                r"(?i)^([A-H])0*([1-9]|1[0-2])$"
            )
        except (AttributeError, KeyError) as e:
            raise ValueError(position_error) from e
        # A value that does not match (including a null) yields NaN groups.
        if parsed.isna().any().any():
            raise ValueError(position_error)
        plate_row_col = "plate_row"
        plate_col_col = "plate_col"
        # assign() returns a new frame, so the caller's DataFrame is untouched
        # and any existing columns with these names are not clobbered.
        df = df.assign(
            **{
                plate_row_col: parsed[0].str.upper(),
                plate_col_col: parsed[1].astype(int),
            }
        )

    for row in meta_json:
        content_row = df[df[specimen_name_col] == row[specimen_name_col]]
        plate_name_val = _plate_cell(content_row, plate_name_col)
        plate_row_val = _plate_cell(content_row, plate_row_col)
        plate_col_val = _plate_cell(content_row, plate_col_col)

        if plate_row_val is not None:
            plate_row_val = plate_row_val.upper()
        if plate_col_val is not None:
            try:
                plate_col_val = int(plate_col_val)
            except (TypeError, ValueError):
                # Keep values that are not integer-like exactly as given.
                pass

        plate_info = {}
        if plate_name_val:
            plate_info["plate_name"] = plate_name_val
        if plate_row_val:
            plate_info["plate_row"] = plate_row_val
        if plate_col_val is not None:
            plate_info["plate_col"] = plate_col_val
        if plate_info:
            row[entry_name] = plate_info
    return meta_json
def _density_cell(frame, column):
    """Return the first value of ``column`` in ``frame`` as a native Python value, or None when the column is unset or the value is null."""
    if not column:
        return None
    value = frame[column].iloc[0]
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return None
    # numpy scalars (e.g. numpy.int64) cannot be serialised by the json module;
    # unwrap them to the equivalent built-in type.
    if type(value).__module__ == "numpy" and hasattr(value, "item"):
        return value.item()
    return value


def add_parasite_density_info(
    parasite_density_col,
    parasite_density_method_col,
    meta_json,
    df,
    specimen_name_col,
    entry_name,
):
    """
    Attach parasite density measurements to each record of ``meta_json``.

    Density and method columns are paired positionally. Each record is matched to
    the first row of ``df`` whose ``specimen_name_col`` value equals the record's
    value under the same key. For every pair with a non-null density, a
    dictionary ``{"parasite_density": ..., "parasite_density_method": ...}`` is
    appended (the method is omitted when unset or null). Records with no density
    values are left untouched.

    :param parasite_density_col: density column name, list of density column names, or None
    :param parasite_density_method_col: method column name, list of method column names (same length as the density list), or None
    :param meta_json: list of record dictionaries; updated in place and returned
    :param df: DataFrame holding the density/method columns and the key column
    :param specimen_name_col: key present both as a column of ``df`` and in every record
    :param entry_name: key under which the list of density dictionaries is stored
    :return: ``meta_json``
    :raises ValueError: if a method column is given without a density column, or the two lists differ in length
    :raises TypeError: if the two arguments are of incompatible or unsupported types
    """
    if parasite_density_col is None:
        if parasite_density_method_col is None:
            # Nothing requested.
            return meta_json
        if isinstance(parasite_density_method_col, (list, str)):
            raise ValueError(
                "parasite_density_method_col is set but parasite_density_col is None. Cannot proceed."
            )
        raise TypeError(
            "Invalid types for parasite_density_col and parasite_density_method_col."
        )
    elif isinstance(parasite_density_col, list):
        if parasite_density_method_col is None:
            density_method_pairs = [(d_col, None) for d_col in parasite_density_col]
        elif isinstance(parasite_density_method_col, list):
            if len(parasite_density_col) != len(parasite_density_method_col):
                raise ValueError(
                    "If both parasite_density_col and parasite_density_method_col are lists, they must be the same length."
                )
            density_method_pairs = list(
                zip(parasite_density_col, parasite_density_method_col)
            )
        else:
            raise TypeError(
                "If parasite_density_col is a list, parasite_density_method_col must be a list or None."
            )
    elif isinstance(parasite_density_col, str):
        if parasite_density_method_col is None:
            density_method_pairs = [(parasite_density_col, None)]
        elif isinstance(parasite_density_method_col, str):
            density_method_pairs = [(parasite_density_col, parasite_density_method_col)]
        else:
            raise TypeError(
                "If parasite_density_col is a string, parasite_density_method_col must be a string or None."
            )
    else:
        raise TypeError(
            "Invalid types for parasite_density_col and parasite_density_method_col."
        )

    # An empty density list leaves nothing to add; skip the per-record lookups.
    if not density_method_pairs:
        return meta_json

    # Add parasite density info to meta_json
    for row in meta_json:
        content_row = df[df[specimen_name_col] == row[specimen_name_col]]
        density_infos = []
        for density_col, method_col in density_method_pairs:
            density_val = _density_cell(content_row, density_col)
            if density_val is None:
                continue
            info = {"parasite_density": density_val}
            method_val = _density_cell(content_row, method_col)
            if method_val is not None:
                info["parasite_density_method"] = method_val
            density_infos.append(info)
        if density_infos:
            row[entry_name] = density_infos
    return meta_json