from AMDirT.validate.application import AMDirValidator
from AMDirT.validate.exceptions import DatasetValidationError
import warnings
import pandas as pd
from AMDirT.core import logger, get_json_path
from json import load
from os.path import join
def get_remote_resources():
json_path = get_json_path()
with open(json_path, "r") as f:
return load(f)
[docs]def merge_new_df(
dataset,
table_type,
table_name,
markdown,
outdir,
verbose,
schema_check=True,
line_dup=True,
columns=True,
):
"""Merge a new dataset with the remote master dataset
Args:
dataset (Path): Path to new dataset
table_type (str): Type of table to merge (samples or libraries)
table_name (str): Kind of table to merge (e.g. ancientmetagenome-hostassociated, ancientmetagenome-environmental, etc.)
markdown (bool): Log in markdown format
outdir (Path): Path to output directory
verbose (bool): Enable verbose mode
schema_check (bool, optional): Enable schema check. Defaults to True.
line_dup (bool, optional): Enable line duplication check. Defaults to True.
columns (bool, optional): Enable columns presence/absence check. Defaults to True.
Raises:
ValueError: Table type must be either 'samples' or 'libraries'
ValueError: Table name not found in AncientMetagenomeDir file
DatasetValidationError: New dataset is not valid
"""
remote_resources = get_remote_resources()
if table_type not in ['samples', 'libraries']:
raise ValueError("table_type must be either 'samples' or 'libraries'")
if table_name not in remote_resources[table_type]:
raise ValueError("table_name not found in AncientMetagenomeDir file")
if not verbose:
warnings.filterwarnings("ignore")
schema = remote_resources[f"{table_type}_schema"][table_name]
dataset_valid = list()
v = AMDirValidator(schema, dataset)
dataset_valid.append(v.parsing_ok)
if schema_check and v.parsing_ok:
dataset_valid.append(v.validate_schema())
if line_dup and v.parsing_ok:
dataset_valid.append(v.check_duplicate_rows())
if columns and v.parsing_ok:
dataset_valid.append(v.check_columns())
dataset_valid = all(dataset_valid)
if dataset_valid is False:
if markdown:
v.to_markdown()
else:
v.to_rich()
raise DatasetValidationError("New Dataset is not valid")
else:
remote_dataset = pd.read_table(remote_resources[table_type][table_name], dtype=dict(v.dataset.dtypes))
logger.info("New Dataset is valid")
logger.info(f"Merging new dataset with remote {table_name} {table_type} dataset")
dataset = pd.concat([remote_dataset, v.dataset])
dataset.drop_duplicates(inplace=True)
dataset.to_csv(join(outdir,f"{table_name}_{table_type}.tsv"), sep="\t", na_rep= "NA", index=False)
logger.info(f"New {table_name} {table_type} dataset written to {join(outdir,f'{table_name}_{table_type}.tsv')}")