# Source code for AMDirT.merge

from AMDirT.validate.application import AMDirValidator
from AMDirT.validate.exceptions import DatasetValidationError
import warnings
import pandas as pd
from AMDirT.core import logger, get_json_path
from json import load
from os.path import join

def get_remote_resources():
    """Load the JSON resource file located by ``get_json_path()``.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.
        ``merge_new_df`` indexes the result by table type and by
        ``"<table_type>_schema"``, so it is expected to be a mapping.
    """
    json_path = get_json_path()
    # Pin the encoding: without it ``open`` falls back to the locale default,
    # which is not guaranteed to be UTF-8 on every platform.
    with open(json_path, "r", encoding="utf-8") as handle:
        return load(handle)


def merge_new_df(
    dataset,
    table_type,
    table_name,
    markdown,
    outdir,
    verbose,
    schema_check=True,
    line_dup=True,
    columns=True,
):
    """Validate a new dataset and merge it with the remote master dataset.

    The new dataset is checked with ``AMDirValidator``. If every enabled
    check passes, the remote table is read, concatenated with the new
    dataset, de-duplicated and written to
    ``<outdir>/<table_name>_<table_type>.tsv`` (tab-separated, missing values
    written as ``NA``, no index column).

    Args:
        dataset (Path): Path to new dataset
        table_type (str): Type of table to merge (samples or libraries)
        table_name (str): Kind of table to merge (e.g.
            ancientmetagenome-hostassociated,
            ancientmetagenome-environmental, etc.)
        markdown (bool): On validation failure, report with
            ``to_markdown()`` instead of ``to_rich()``
        outdir (Path): Path to output directory
        verbose (bool): Enable verbose mode. When false, warnings are
            silenced.
        schema_check (bool, optional): Enable schema check. Defaults to True.
        line_dup (bool, optional): Enable line duplication check.
            Defaults to True.
        columns (bool, optional): Enable columns presence/absence check.
            Defaults to True.

    Returns:
        None. The merged table is written to disk.

    Raises:
        ValueError: Table type must be either 'samples' or 'libraries'
        ValueError: Table name not found in AncientMetagenomeDir file
        DatasetValidationError: New dataset is not valid
    """
    remote_resources = get_remote_resources()

    if table_type not in ("samples", "libraries"):
        raise ValueError("table_type must be either 'samples' or 'libraries'")
    if table_name not in remote_resources[table_type]:
        raise ValueError("table_name not found in AncientMetagenomeDir file")
    if not verbose:
        # NOTE: this installs a process-wide "ignore" filter; it is not
        # restored when the function returns.
        warnings.filterwarnings("ignore")

    schema = remote_resources[f"{table_type}_schema"][table_name]
    validator = AMDirValidator(schema, dataset)

    # Collect the outcome of every enabled check. The optional checks only
    # run when the dataset could be parsed at all.
    check_results = [validator.parsing_ok]
    if schema_check and validator.parsing_ok:
        check_results.append(validator.validate_schema())
    if line_dup and validator.parsing_ok:
        check_results.append(validator.check_duplicate_rows())
    if columns and validator.parsing_ok:
        check_results.append(validator.check_columns())

    if not all(check_results):
        # Emit the validator's report in the requested format before failing.
        if markdown:
            validator.to_markdown()
        else:
            validator.to_rich()
        raise DatasetValidationError("New Dataset is not valid")

    # Read the remote table with the dtypes of the new dataset so that both
    # frames share column types before they are concatenated.
    remote_dataset = pd.read_table(
        remote_resources[table_type][table_name],
        dtype=dict(validator.dataset.dtypes),
    )

    logger.info("New Dataset is valid")
    logger.info(f"Merging new dataset with remote {table_name} {table_type} dataset")

    merged = pd.concat([remote_dataset, validator.dataset]).drop_duplicates()

    out_path = join(outdir, f"{table_name}_{table_type}.tsv")
    merged.to_csv(out_path, sep="\t", na_rep="NA", index=False)
    logger.info(f"New {table_name} {table_type} dataset written to {out_path}")