Skip to content

Commit

Permalink
handling sigs with empty names
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-eyes committed Oct 22, 2024
1 parent 7814703 commit 4df3447
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 22 deletions.
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"python.analysis.extraPaths": [
"./src"
],
"cSpell.words": [
"AMPLICON", "signame",
]
// # show line numbers in jupyter notebooks
// "jupyter.lineNumbers": "on",
Expand Down
18 changes: 6 additions & 12 deletions src/snipe/api/multisig_reference_QC.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,12 +391,7 @@ def process_sample(self, sample_sig: SnipeSig, predict_extra_folds: Optional[Lis


# ============= SAMPLE STATS =============

# if sample name is empty, we must set one with the file basename without extension
if not sample_sig.name:
sample_sig.name = os.path.basename(sample_sig.filename).split('.')[0]
self.logger.warning("Sample name is empty. Setting it to %s.", sample_sig.name)


self.logger.debug("Processing sample statistics.")
sample_stats_raw = sample_sig.get_sample_stats
sample_stats.update({
Expand Down Expand Up @@ -734,9 +729,9 @@ def sort_chromosomes(chr_name):

sample_nonref = sample_sig - self.reference_sig

self.logger.debug("\t-Size of non-reference k-mers in the sample signature: %d hashes.", len(sample_nonref))
self.logger.debug("\tSize of non-reference k-mers in the sample signature: %d hashes.", len(sample_nonref))
# sample_nonref.trim_singletons()
self.logger.debug("\t-Size of non-reference k-mers after trimming singletons: %d hashes.", len(sample_nonref))
self.logger.debug("\tSize of non-reference k-mers after trimming singletons: %d hashes.", len(sample_nonref))

sample_nonref_unique_hashes = len(sample_nonref)
sample_nonref_total_abundance = sample_nonref.total_abundance
Expand All @@ -752,12 +747,11 @@ def sort_chromosomes(chr_name):
sample_nonref_var: SnipeSig = sample_nonref & variance_sig

if self.export_varsigs:
__sample_name = sample_sig.name.replace(' ','_').lower()
__var_name = variance_name.replace(' ','_').lower()
__sample_name = sample_sig.name.replace(' ','_')
__var_name = variance_name.replace(' ','_')
__filename = os.path.basename(f"{__sample_name}_{__var_name}_nonref.zip".strip())
self.logger.debug("Exporting non-reference k-mers from variable '%s'.", __filename)
var_export_file_path = sample_nonref_var.export(__filename)
sample_nonref_var.export(var_export_file_path)
sample_nonref_var.export(__filename)

sample_nonref_var_total_abundance = sample_nonref_var.total_abundance
sample_nonref_var_fraction_total = sample_nonref_var_total_abundance / sample_nonref_total_abundance
Expand Down
28 changes: 22 additions & 6 deletions src/snipe/api/snipe_sig.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ def __init__(self, *,
else:
self.logger.debug("No signature found in the input. Expected a single sample signature.")
raise ValueError("No signature found in the input. Expected a single sample signature.")

if sig_type == SigType.AMPLICON:
for signame in sourmash_sigs.keys():
if signame == '-' or signame == '':
_emsg = "Amplicon signature must have a name!"
self.logger.error(_emsg)
raise ValueError(_emsg)

elif sig_type == SigType.GENOME:
if len(sourmash_sigs) > 1:
Expand All @@ -131,6 +138,10 @@ def __init__(self, *,
self.chr_to_sig[sig.name] = SnipeSig(sourmash_sig=sig, sig_type=SigType.AMPLICON, enable_logging=enable_logging)
elif signame.startswith("mitochondrial-"):
self.chr_to_sig[sig.name] = SnipeSig(sourmash_sig=sig, sig_type=SigType.AMPLICON, enable_logging=enable_logging)
elif signame == '-' or signame == '':
_emsg = "Reference signature must have a name!"
self.logger.error(_emsg)
raise ValueError(_emsg)
else:
continue
else:
Expand All @@ -143,14 +154,21 @@ def __init__(self, *,
else:
self.logger.debug("Unknown sigtype: %s", sig_type)
raise ValueError(f"Unknown sigtype: {sig_type}")


_sourmash_sig_name = _sourmash_sig.name
if _sourmash_sig_name == '-':
_sourmash_sig_name = ''

self.logger.debug("Length of currently loaded signature: %d, with name: %s", len(_sourmash_sig), _sourmash_sig.name)

self.logger.debug("Loaded %s with name %s", sig_type, _sourmash_sig_name)

# Extract properties from the loaded signature
self._ksize = _sourmash_sig.minhash.ksize
self._scale = _sourmash_sig.minhash.scaled
self._md5sum = _sourmash_sig.md5sum()
self._name = _sourmash_sig.name
self._name = _sourmash_sig_name
self.logger.debug("Loaded signature name is %s", self._name)
self._filename = _sourmash_sig.filename
self._track_abundance = _sourmash_sig.minhash.track_abundance

Expand Down Expand Up @@ -289,6 +307,7 @@ def name(self, name: str):
r"""
Set the name of the signature.
"""
self.logger.debug("Setting name to %s", name)
self._name = name

@track_abundance.setter
Expand Down Expand Up @@ -513,7 +532,7 @@ def _convert_to_sourmash_signature(self):
self.sourmash_sig = sourmash.signature.SourmashSignature(mh, name=self._name, filename=self._filename)
self.logger.debug("Conversion to sourmash.signature.SourmashSignature completed.")

def export(self, path, force=False) -> None:
def export(self, path) -> None:
r"""
Export the signature to a file.
Expand All @@ -528,9 +547,6 @@ def export(self, path, force=False) -> None:
sourmash.signature.save_signatures_to_json([self.sourmash_sig], fp)

elif path.endswith(".zip"):
if os.path.exists(path):
self.logger.debug(f"Output file already exists: {path}")
raise FileExistsError("Output file already exists.")
try:
with sourmash.save_load.SaveSignatures_ZipFile(path) as save_sigs:
save_sigs.add(self.sourmash_sig)
Expand Down
20 changes: 16 additions & 4 deletions src/snipe/cli/cli_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,11 @@ def process_subset(
subset_failed = []
for sample_path in subset:
sample_sig = SnipeSig(sourmash_sig=sample_path, sig_type=SigType.SAMPLE, enable_logging=debug)
if len(sample_sig.name) == 0:
# warn and set to basename without extension
subset_logger.debug(f"DELME Processing sample: {sample_sig.name}")
if sample_sig.name == "":
_newname = os.path.basename(sample_path).split('.')[0]
subset_logger.warning(f"Sample name is empty. Setting to: {_newname}")

sample_sig.name = _newname
subset_logger.warning(f"Sample name is empty. Setting to: `{sample_sig.name}`")

try:
sample_stats = qc_inst.process_sample(
Expand Down Expand Up @@ -678,6 +678,12 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str],
with tqdm(total=len(valid_samples), desc="Processing samples") as pbar:
for sample_path in valid_samples:
sample_sig = SnipeSig(sourmash_sig=sample_path, sig_type=SigType.SAMPLE, enable_logging=debug)
qc_instance.logger.debug(f"DELME Processing sample: {sample_sig.name}")
if sample_sig.name == "":
_newname = os.path.basename(sample_path).split('.')[0]
sample_sig.name = _newname
qc_instance.logger.warning(f"Sample name is empty. Setting to: `{sample_sig.name}`")

try:
sample_stats = qc_instance.process_sample(
sample_sig=sample_sig,
Expand Down Expand Up @@ -819,6 +825,12 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str],
metadata_str, metadata_md5sum = METADATA.export_and_verify_metadata(
metadata=export_metadata
)

# sanitize file_path and filename
df["filename"] = df["file_path"].apply(os.path.basename)
# drop file_path
df.drop(columns=["file_path"], inplace=True)


try:
with open(output, 'w', encoding='utf-8') as f:
Expand Down

0 comments on commit 4df3447

Please sign in to comment.