Source code for pmotools.pmo_builder.metatable_to_pmo

#!/usr/bin/env python3
import pandas as pd
import json
from .json_convert_utils import remove_optional_null_values, check_null_values


def pandas_table_to_json(contents: pd.DataFrame, return_indexed_dict: bool = False):
    """
    Convert a pandas dataframe table into plain JSON-compatible Python data.

    The table is serialised with ``DataFrame.to_json`` (dates in ISO format) and
    parsed back with ``json.loads``; every ``null`` found inside a JSON object is
    replaced by an empty string.

    :param contents: the dataframe to be converted
    :param return_indexed_dict: if True, return a dictionary keyed by the index labels (as JSON string keys); otherwise return a list with one dictionary per row
    :return: a dictionary of row dictionaries when ``return_indexed_dict`` is True, else a list of row dictionaries
    """

    # object_hook applied to every decoded JSON object: None -> ""
    def blank_for_null(decoded):
        return {key: ("" if value is None else value) for key, value in decoded.items()}

    # ``index=True`` is only passed for the index orientation; the records
    # orientation is serialised without the index.
    if return_indexed_dict:
        serialised = contents.to_json(orient="index", index=True, date_format="iso")
    else:
        serialised = contents.to_json(orient="records", date_format="iso")
    return json.loads(serialised, object_hook=blank_for_null)


def library_sample_info_table_to_pmo(
    contents: pd.DataFrame,
    library_sample_name_col: str = "library_sample_name",
    specimen_name_col: str = "specimen_name",
    panel_name_col: str = "panel_name",
    sequencing_info_name_col: str = None,
    alternate_identifiers_col: str = None,
    experiment_accession_col: str = None,
    fastqs_loc_col: str = None,
    library_prep_plate_name_col: str = None,
    library_prep_plate_col_col: str = None,
    library_prep_plate_row_col: str = None,
    library_prep_plate_position_col: str = None,
    parasite_density_col: str | list | None = None,
    parasite_density_method_col: str | list | None = None,
    run_accession_col: str = None,
    additional_library_sample_info_cols: list | None = None,
    list_values_library_values: list | None = ["alternate_identifiers"],
    list_values_library_values_delimiter: str = ",",
):
    """
    Convert a DataFrame containing library information into JSON.

    :param contents: input DataFrame containing library data
    :type contents: pd.DataFrame
    :param library_sample_name_col: column name for library sample names. Default: library_sample_name
    :type library_sample_name_col: str
    :param specimen_name_col: column name for specimen names. Default: specimen_name
    :type specimen_name_col: str
    :param panel_name_col: column name for panel names. Default: panel_name
    :type panel_name_col: str
    :param sequencing_info_name_col: column name for sequencing information names
    :type sequencing_info_name_col: str, optional
    :param alternate_identifiers_col: column name for alternate identifiers
    :type alternate_identifiers_col: str, optional
    :param experiment_accession_col: column name for experiment accession information
    :type experiment_accession_col: str, optional
    :param fastqs_loc_col: column name for location of fastqs
    :type fastqs_loc_col: str, optional
    :param library_prep_plate_name_col: column name containing plate name for sequencing
    :type library_prep_plate_name_col: str, optional
    :param library_prep_plate_col_col: column name for the column of the sample on the sequencing plate
    :type library_prep_plate_col_col: str, optional
    :param library_prep_plate_row_col: column name for the row of the sample on the sequencing plate
    :type library_prep_plate_row_col: str, optional
    :param library_prep_plate_position_col: column name for position on the sequencing plate (e.g. A01). Can't be set if library_prep_plate_col_col and library_prep_plate_row_col are specified.
    :type library_prep_plate_position_col: str, optional
    :param parasite_density_col: the parasite density in parasites per microliter
    :type parasite_density_col: str or list of str, optional
    :param parasite_density_method_col: the method of how the density was obtained. If set, parasite_density_col must also be specified.
    :type parasite_density_method_col: str or list of str, optional
    :param run_accession_col: column name for run accession information
    :type run_accession_col: str, optional
    :param additional_library_sample_info_cols: additional column names to include
    :type additional_library_sample_info_cols: list of str, optional
    :param list_values_library_values: output fields whose values could be a list, delimited by list_values_library_values_delimiter. ``None`` or an empty list disables list conversion.
    :type list_values_library_values: list of str, optional
    :param list_values_library_values_delimiter: delimiter between list_values_library_values. Default: ','
    :type list_values_library_values_delimiter: str
    :return: the per-row records (one dictionary per library sample) after ``remove_optional_null_values`` has been applied to the optional fields
    :rtype: list of dict (presumably; the final value is whatever ``remove_optional_null_values`` returns)
    :raises ValueError: if ``contents`` is not a DataFrame, or a listed field holds a value that is neither a primitive nor a list. The column checks and plate/density helpers called here raise their own errors.
    """
    # Check contents is a dataframe
    if not isinstance(contents, pd.DataFrame):
        raise ValueError("contents must be a pandas DataFrame.")

    copy_contents = contents.copy()
    column_mapping = {library_sample_name_col: "library_sample_name"}
    required_columns = list(column_mapping.keys())
    recommended_columns = [specimen_name_col, panel_name_col, sequencing_info_name_col]

    # Add optional columns (entries whose source column is None are dropped below)
    optional_column_mapping = {
        specimen_name_col: "specimen_name",
        panel_name_col: "panel_name",
        sequencing_info_name_col: "sequencing_info_name",
        alternate_identifiers_col: "alternate_identifiers",
        experiment_accession_col: "experiment_accession",
        fastqs_loc_col: "fastqs_loc",
        run_accession_col: "run_accession",
    }
    column_mapping.update(
        {k: v for k, v in optional_column_mapping.items() if k is not None}
    )

    # Include additional user-defined columns if provided
    if additional_library_sample_info_cols:
        for col in additional_library_sample_info_cols:
            column_mapping[col] = col

    # Checks on columns selected. run_accession_col is included so that a
    # duplicated source column is reported instead of silently overwriting
    # another entry of the mapping above.
    check_unique_columns(
        [
            library_sample_name_col,
            sequencing_info_name_col,
            specimen_name_col,
            panel_name_col,
            alternate_identifiers_col,
            experiment_accession_col,
            fastqs_loc_col,
            run_accession_col,
        ]
    )
    check_columns_exist(copy_contents, list(column_mapping.keys()))

    # Check for null values in required columns and in the recommended columns
    # that were actually selected (None entries are never keys of the mapping).
    columns_to_check = required_columns + [
        col for col in recommended_columns if col in column_mapping
    ]
    check_null_values(copy_contents, columns_to_check)

    # Rename and subset columns
    selected_pmo_fields = list(column_mapping.values())
    copy_contents = copy_contents.rename(columns=column_mapping)
    subset_contents = copy_contents[selected_pmo_fields]

    # Convert to format
    meta_json = pandas_table_to_json(subset_contents)
    # Plate information is per library sample, so rows are matched on the
    # library sample name (always present) rather than on the specimen name,
    # which may repeat across replicate libraries or be absent.
    meta_json = add_plate_info(
        library_prep_plate_col_col,
        library_prep_plate_name_col,
        library_prep_plate_row_col,
        library_prep_plate_position_col,
        meta_json,
        copy_contents,
        "library_sample_name",
        "library_prep_plate_info",
    )
    meta_json = add_parasite_density_info(
        parasite_density_col,
        parasite_density_method_col,
        meta_json,
        copy_contents,
        "library_sample_name",
        entry_name="parasite_density_info",
    )

    # listify fields that contain values that could be a list, delimited by the
    # argument list_values_library_values_delimiter
    primitives = (int, float, str, bool, complex)
    for col in list_values_library_values or []:
        for lib in meta_json:
            # Only touch fields that made it into the record; a same-named
            # column that was not selected must not trigger a KeyError.
            if col not in lib:
                continue
            value = lib[col]
            if isinstance(value, str):
                lib[col] = value.split(list_values_library_values_delimiter)
            elif isinstance(value, list):
                continue
            elif isinstance(value, primitives):
                lib[col] = [value]
            else:
                raise ValueError(
                    f"Column '{col}' must contain either strings or lists of strings."
                )

    meta_json = remove_optional_null_values(
        meta_json, list(optional_column_mapping.values())
    )
    return meta_json


def specimen_info_table_to_pmo(
    contents: pd.DataFrame,
    specimen_name_col: str = "specimen_name",
    specimen_taxon_id_col: str = None,
    host_taxon_id_col: str = None,
    collection_date_col: str = None,
    collection_country_col: str = None,
    project_name_col: str = None,
    alternate_identifiers_col: str = None,
    blood_meal_col: str = None,
    drug_usage_col: str = None,
    env_broad_scale_col: str = None,
    env_local_scale_col: str = None,
    env_medium_col: str = None,
    geo_admin1_col: str = None,
    geo_admin2_col: str = None,
    geo_admin3_col: str = None,
    gravid_col: str = None,
    gravidity_col: str = None,
    has_travel_out_six_month_col: str = None,
    host_age_col: str = None,
    host_sex_col: str = None,
    host_subject_id: str = None,
    lat_lon_col: str = None,
    parasite_density_col: str | list | None = None,
    parasite_density_method_col: str | list | None = None,
    specimen_accession_col: str = None,
    storage_plate_col_col: str = None,
    storage_plate_name_col: str = None,
    storage_plate_row_col: str = None,
    storage_plate_position_col: str = None,
    specimen_collect_device_col: str = None,
    specimen_comments_col: str = None,
    specimen_store_loc_col: str = None,
    specimen_type_col: str = None,
    treatment_status_col: str = None,
    additional_specimen_cols: list | None = None,
    list_values_specimen_values: list | None = [
        "alternate_identifiers",
        "drug_usage",
        "specimen_comments",
        "treatment_status",
        "specimen_taxon_id",
    ],
    list_values_specimen_values_delimiter: str = ",",
):
    """
    Convert a DataFrame containing specimen information into JSON.

    :param contents: the input DataFrame containing specimen data
    :type contents: pd.DataFrame
    :param specimen_name_col: the column name for specimen names. Default: specimen_name
    :type specimen_name_col: str
    :param specimen_taxon_id_col: NCBI taxonomy number of the organism
    :type specimen_taxon_id_col: str, optional
    :param host_taxon_id_col: NCBI taxonomy number of the host
    :type host_taxon_id_col: str, optional
    :param collection_date_col: date of the sample collection
    :type collection_date_col: str, optional
    :param collection_country_col: name of country collected in (admin level 0)
    :type collection_country_col: str, optional
    :param project_name_col: name of the project
    :type project_name_col: str, optional
    :param alternate_identifiers_col: list of optional alternative names for the samples
    :type alternate_identifiers_col: str, optional
    :param blood_meal_col: whether the host specimen has had a recent blood meal
    :type blood_meal_col: str, optional
    :param drug_usage_col: any drug used by the subject and the frequency of usage; can include multiple drugs used
    :type drug_usage_col: str, optional
    :param env_broad_scale_col: the broad environment from which the specimen was collected
    :type env_broad_scale_col: str, optional
    :param env_local_scale_col: the local environment from which the specimen was collected
    :type env_local_scale_col: str, optional
    :param env_medium_col: the environment medium from which the specimen was collected
    :type env_medium_col: str, optional
    :param geo_admin1_col: geographical admin level 1
    :type geo_admin1_col: str, optional
    :param geo_admin2_col: geographical admin level 2
    :type geo_admin2_col: str, optional
    :param geo_admin3_col: geographical admin level 3
    :type geo_admin3_col: str, optional
    :param gravid_col: whether the host specimen is pregnant
    :type gravid_col: str, optional
    :param gravidity_col: the number of previous pregnancies
    :type gravidity_col: str, optional
    :param has_travel_out_six_month_col: whether the host specimen has travelled out from the local region in the last six months
    :type has_travel_out_six_month_col: str, optional
    :param host_age_col: the age in years of the person
    :type host_age_col: str, optional
    :param host_sex_col: if the specimen is from a person, the sex of that person
    :type host_sex_col: str, optional
    :param host_subject_id: ID for the individual a specimen was collected from
    :type host_subject_id: str, optional
    :param lat_lon_col: latitude and longitude of the collection site
    :type lat_lon_col: str, optional
    :param parasite_density_col: the parasite density in parasites per microliter
    :type parasite_density_col: str or list of str, optional
    :param parasite_density_method_col: the method of how the density was obtained. If set, parasite_density_col must also be specified.
    :type parasite_density_method_col: str or list of str, optional
    :param specimen_accession_col: the accession number of the specimen
    :type specimen_accession_col: str, optional
    :param storage_plate_col_col: column the specimen was in on the plate. If set, storage_plate_row_col must also be specified.
    :type storage_plate_col_col: str, optional
    :param storage_plate_name_col: name of the plate the specimen was in
    :type storage_plate_name_col: str, optional
    :param storage_plate_row_col: row the specimen was in on the plate. If set, storage_plate_col_col must also be specified.
    :type storage_plate_row_col: str, optional
    :param storage_plate_position_col: position of the specimen on the plate (e.g. A01). Can't be set if storage_plate_col_col and storage_plate_row_col are specified.
    :type storage_plate_position_col: str, optional
    :param specimen_collect_device_col: the way the specimen was collected
    :type specimen_collect_device_col: str, optional
    :param specimen_comments_col: additional comments about the specimen
    :type specimen_comments_col: str, optional
    :param specimen_store_loc_col: specimen storage site
    :type specimen_store_loc_col: str, optional
    :param specimen_type_col: type of specimen, e.g. negative_control, positive_control, field_sample
    :type specimen_type_col: str, optional
    :param treatment_status_col: if the person has been treated with drugs, what the treatment outcome was
    :type treatment_status_col: str, optional
    :param additional_specimen_cols: additional column names to include
    :type additional_specimen_cols: list of str, optional
    :param list_values_specimen_values: output fields whose values could be a list, delimited by list_values_specimen_values_delimiter. ``None`` or an empty list disables list conversion.
    :type list_values_specimen_values: list of str, optional
    :param list_values_specimen_values_delimiter: delimiter between list_values_specimen_values. Default: ','
    :type list_values_specimen_values_delimiter: str
    :return: the per-row records (one dictionary per specimen row) after ``remove_optional_null_values`` has been applied to the optional fields
    :rtype: list of dict (presumably; the final value is whatever ``remove_optional_null_values`` returns)
    :raises ValueError: if ``contents`` is not a DataFrame, or a listed field holds a value that is neither a primitive nor a list. The column checks and plate/density helpers called here raise their own errors.
    """
    # Check contents is a dataframe
    if not isinstance(contents, pd.DataFrame):
        raise ValueError("contents must be a pandas DataFrame.")

    copy_contents = contents.copy()

    column_mapping = {specimen_name_col: "specimen_name"}
    required_columns = list(column_mapping.keys())
    recommended_columns = [
        specimen_taxon_id_col,
        host_taxon_id_col,
        collection_date_col,
        collection_country_col,
        project_name_col,
    ]
    # Source column -> output field; entries whose source column is None are
    # dropped when merged into column_mapping below.
    optional_column_mapping = {
        specimen_taxon_id_col: "specimen_taxon_id",
        host_taxon_id_col: "host_taxon_id",
        collection_date_col: "collection_date",
        collection_country_col: "collection_country",
        project_name_col: "project_name",
        alternate_identifiers_col: "alternate_identifiers",
        drug_usage_col: "drug_usage",
        blood_meal_col: "blood_meal",
        gravid_col: "gravid",
        gravidity_col: "gravidity",
        has_travel_out_six_month_col: "has_travel_out_six_month",
        env_broad_scale_col: "env_broad_scale",
        env_local_scale_col: "env_local_scale",
        env_medium_col: "env_medium",
        geo_admin1_col: "geo_admin1",
        geo_admin2_col: "geo_admin2",
        geo_admin3_col: "geo_admin3",
        host_age_col: "host_age",
        host_sex_col: "host_sex",
        host_subject_id: "host_subject_id",
        lat_lon_col: "lat_lon",
        specimen_accession_col: "specimen_accession",
        specimen_type_col: "specimen_type",
        treatment_status_col: "treatment_status",
        specimen_collect_device_col: "specimen_collect_device",
        specimen_comments_col: "specimen_comments",
        specimen_store_loc_col: "specimen_store_loc",
    }

    column_mapping.update(
        {k: v for k, v in optional_column_mapping.items() if k is not None}
    )

    # Include additional user-defined columns if provided
    if additional_specimen_cols:
        for col in additional_specimen_cols:
            column_mapping[col] = col

    # Check column selection
    check_unique_columns(
        [
            specimen_name_col,
            specimen_taxon_id_col,
            host_taxon_id_col,
            collection_date_col,
            collection_country_col,
            project_name_col,
            alternate_identifiers_col,
            drug_usage_col,
            env_broad_scale_col,
            env_local_scale_col,
            env_medium_col,
            geo_admin1_col,
            geo_admin2_col,
            geo_admin3_col,
            host_age_col,
            host_sex_col,
            host_subject_id,
            lat_lon_col,
            specimen_accession_col,
            specimen_type_col,
            treatment_status_col,
            storage_plate_col_col,
            storage_plate_name_col,
            storage_plate_row_col,
            storage_plate_position_col,
            specimen_collect_device_col,
            specimen_comments_col,
            specimen_store_loc_col,
            blood_meal_col,
            gravid_col,
            gravidity_col,
            has_travel_out_six_month_col,
        ]
    )
    check_columns_exist(copy_contents, list(column_mapping.keys()))

    # Check for null values in required columns and in the recommended columns
    # that were actually selected (None entries are never keys of the mapping).
    columns_to_check = required_columns + [
        col for col in recommended_columns if col in column_mapping
    ]
    check_null_values(copy_contents, columns_to_check)

    # Rename and subset columns
    selected_pmo_fields = list(column_mapping.values())
    copy_contents = copy_contents.rename(columns=column_mapping)

    subset_contents = copy_contents[selected_pmo_fields]
    meta_json = pandas_table_to_json(subset_contents)
    meta_json = add_parasite_density_info(
        parasite_density_col,
        parasite_density_method_col,
        meta_json,
        copy_contents,
        "specimen_name",
        entry_name="parasite_density_info",
    )

    meta_json = add_plate_info(
        storage_plate_col_col,
        storage_plate_name_col,
        storage_plate_row_col,
        storage_plate_position_col,
        meta_json,
        copy_contents,
        "specimen_name",
        entry_name="storage_plate_info",
    )

    # listify fields that contain values that could be a list, delimited by the
    # argument list_values_specimen_values_delimiter
    primitives = (int, float, str, bool, complex)
    for col in list_values_specimen_values or []:
        for spec in meta_json:
            # Only touch fields that made it into the record; a same-named
            # column that was not selected must not trigger a KeyError.
            if col not in spec:
                continue
            value = spec[col]
            if isinstance(value, str):
                spec[col] = value.split(list_values_specimen_values_delimiter)
            elif isinstance(value, list):
                continue
            elif isinstance(value, primitives):
                spec[col] = [value]
            else:
                raise ValueError(
                    f"Column '{col}' must contain either strings or lists of strings."
                )

    meta_json = remove_optional_null_values(
        meta_json, list(optional_column_mapping.values())
    )
    return meta_json


def check_unique_columns(columns):
    """
    Ensure no column name was selected more than once.

    :param columns: selected column names; ``None`` entries (unset options) are ignored
    :raises ValueError: if any non-None name appears more than once
    """
    selected = [col for col in columns if col is not None]
    if len(set(selected)) != len(selected):
        raise ValueError("Selected columns must be unique.")


def check_columns_exist(df, columns):
    """
    Ensure every requested column is present in the DataFrame.

    :param df: DataFrame whose ``columns`` are checked
    :param columns: column names that must be present
    :raises ValueError: listing every requested column that is not in ``df``
    """
    df_columns = df.columns
    missing_cols = [col for col in columns if col not in df_columns]
    if missing_cols:
        raise ValueError(
            f"The following columns are not in the DataFrame: {missing_cols}"
        )


def add_plate_info(
    plate_col_col,
    plate_name_col,
    plate_row_col,
    plate_position_col,
    meta_json,
    df,
    specimen_name_col,
    entry_name="plate_info",
):
    """
    Attach plate location information to each record of ``meta_json``.

    For every record, the first row of ``df`` whose ``specimen_name_col`` value
    equals the record's value under the same key is looked up, and a dictionary
    with any of ``plate_name``, ``plate_row`` and ``plate_col`` that are available
    is stored under ``entry_name``. Missing (null/NaN) values are skipped and
    records with no plate values at all get no entry.

    :param plate_col_col: column holding the plate column; must be given together with plate_row_col
    :param plate_name_col: column holding the plate name
    :param plate_row_col: column holding the plate row; must be given together with plate_col_col
    :param plate_position_col: column holding a combined position such as ``A01``; mutually exclusive with row/col
    :param meta_json: list of record dictionaries, updated in place
    :param df: DataFrame holding the plate columns; it is not modified
    :param specimen_name_col: key/column used to match records to rows of ``df``
    :param entry_name: key under which the plate dictionary is stored. Default: plate_info
    :return: ``meta_json`` (the same list object)
    :raises ValueError: if only one of row/col is given, if position is combined with row/col, or if a position value cannot be parsed
    """
    if all(
        col is None
        for col in (plate_col_col, plate_name_col, plate_row_col, plate_position_col)
    ):
        return meta_json

    # If one of col or row are set both must be
    if (plate_row_col is None) != (plate_col_col is None):
        raise ValueError("If either plate row or column is set, then both must be.")
    # Check position isn't specified in multiple ways
    if plate_position_col:
        if plate_col_col:
            raise ValueError(
                "Plate position can be specified using either row and col, or position, but not both."
            )
        plate_row_col = "plate_row"
        plate_col_col = "plate_col"
        # Derive row/col on a private copy so the helper columns never leak
        # into, or overwrite columns of, the caller's DataFrame.
        df = df.copy()
        try:
            positions = df[plate_position_col]
            df[plate_row_col] = (
                positions.str.extract(r"(?i)^([A-H])")[0].str.upper()
            )
            # Unparseable positions extract as NaN, which astype(int) rejects.
            df[plate_col_col] = (
                positions.str.extract(r"(?i)^[A-H]0*([1-9]|1[0-2])$")[0].astype(int)
            )
        except (AttributeError, ValueError, TypeError, IndexError, KeyError) as e:
            raise ValueError(
                f"Values in '{plate_position_col}' must start with a single letter A-H/a-h followed by number 1-12."
            ) from e

    for row in meta_json:
        content_row = df[df[specimen_name_col] == row[specimen_name_col]]
        plate_name_val = content_row[plate_name_col].iloc[0] if plate_name_col else None
        plate_row_val = content_row[plate_row_col].iloc[0] if plate_row_col else None
        plate_col_val = content_row[plate_col_col].iloc[0] if plate_col_col else None

        plate_info = {}
        # pd.isna is checked first: NaN is truthy and would otherwise be stored.
        if not pd.isna(plate_name_val) and plate_name_val:
            plate_info["plate_name"] = plate_name_val
        if not pd.isna(plate_row_val) and plate_row_val:
            plate_info["plate_row"] = plate_row_val.upper()
        if not pd.isna(plate_col_val):
            try:
                plate_col_val = int(plate_col_val)
            except (TypeError, ValueError):
                # Not integer-like: keep the value exactly as found.
                pass
            plate_info["plate_col"] = plate_col_val

        if plate_info:
            row[entry_name] = plate_info
    return meta_json


def add_parasite_density_info(
    parasite_density_col,
    parasite_density_method_col,
    meta_json,
    df,
    specimen_name_col,
    entry_name,
):
    """
    Attach parasite density measurements to each record of ``meta_json``.

    For every record, the first row of ``df`` whose ``specimen_name_col`` value
    equals the record's value under the same key is looked up. Each configured
    density column with a non-null value yields a dictionary
    ``{"parasite_density": ...}`` (plus ``"parasite_density_method"`` when a
    non-null method value is available); the resulting list is stored under
    ``entry_name``. Records without any density value get no entry.

    :param parasite_density_col: density column name, a list of such names, or None
    :param parasite_density_method_col: method column name(s) paired with the density column(s): a str for a str, a same-length list for a list, or None
    :param meta_json: list of record dictionaries, updated in place
    :param df: DataFrame holding the density and method columns
    :param specimen_name_col: key/column used to match records to rows of ``df``
    :param entry_name: key under which the list of density dictionaries is stored
    :return: ``meta_json`` (the same list object)
    :raises ValueError: if a method column is given without a density column, or the two lists differ in length
    :raises TypeError: if the argument types are not one of the supported combinations
    """
    if parasite_density_col is None:
        if parasite_density_method_col is not None:
            raise ValueError(
                "parasite_density_method_col is set but parasite_density_col is None. Cannot proceed."
            )
        density_method_pairs = []

    elif isinstance(parasite_density_col, list):
        if parasite_density_method_col is None:
            density_method_pairs = [(d_col, None) for d_col in parasite_density_col]
        elif isinstance(parasite_density_method_col, list):
            if len(parasite_density_col) != len(parasite_density_method_col):
                raise ValueError(
                    "If both parasite_density_col and parasite_density_method_col are lists, they must be the same length."
                )
            density_method_pairs = list(
                zip(parasite_density_col, parasite_density_method_col)
            )
        else:
            raise TypeError(
                "If parasite_density_col is a list, parasite_density_method_col must be a list or None."
            )

    elif isinstance(parasite_density_col, str):
        if parasite_density_method_col is None:
            density_method_pairs = [(parasite_density_col, None)]
        elif isinstance(parasite_density_method_col, str):
            density_method_pairs = [(parasite_density_col, parasite_density_method_col)]
        else:
            raise TypeError(
                "If parasite_density_col is a string, parasite_density_method_col must be a string or None."
            )

    else:
        raise TypeError(
            "Invalid types for parasite_density_col and parasite_density_method_col."
        )

    # Nothing configured: skip the per-record lookups entirely.
    if not density_method_pairs:
        return meta_json

    def _as_builtin(value):
        # numpy scalars (e.g. int64) expose .item(); converting them keeps the
        # result serialisable with the json module. Other values pass through.
        return value.item() if hasattr(value, "item") else value

    # Add parasite density info to meta_json
    for row in meta_json:
        content_row = df[df[specimen_name_col] == row[specimen_name_col]]
        density_infos = []
        for density_col, method_col in density_method_pairs:
            density_val = content_row[density_col].iloc[0] if density_col else None
            method_val = content_row[method_col].iloc[0] if method_col else None
            # Null/NaN densities carry no information and are not emitted.
            if pd.isna(density_val):
                continue
            info = {"parasite_density": _as_builtin(density_val)}
            if not pd.isna(method_val):
                info["parasite_density_method"] = _as_builtin(method_val)
            density_infos.append(info)
        if density_infos:
            row[entry_name] = density_infos
    return meta_json