Skip to content

Commit

Permalink
Fix `ValueError: No objects to concatenate` in `bactopia-summary`
Browse files (browse the repository at this point in the history)
  • Loading branch information
rpetit3 committed Aug 21, 2023
1 parent 493479d commit 1c8d4a9
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 100 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

## 1.0.6

- Fixed `bactopia-summary` handling of empty searches

## 1.0.5

- Fixed `bactopia-download` not building prokka and bakta conda envs
Expand Down
192 changes: 98 additions & 94 deletions bactopia/cli/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,110 +404,114 @@ def summary(
dfs.append(df)
logging.debug(f"\tRank: {rank} ({reason})")
else:
missing_files = ";".join(parsable_files)
logging.debug(
f"Skipping {sample['id']} ({sample['path']}) due to missing files: {missing_files}"
f"Skipping {sample['id']} ({sample['path']}) due to missing files. Missing:"
)
for missing_file in parsable_files:
logging.debug(f"\t{missing_file}")
increment_and_append("ignore-unknown", sample["id"])
else:
logging.debug(
f"Skipping {sample['id']} ({sample['path']}), incomplete or not a Bactopia directory"
)
increment_and_append("ignore-unknown", sample["id"])
final_df = pd.concat(dfs)
for col in EXCLUDE_COLUMNS:
if col in final_df.columns:
final_df.drop(col, axis=1, inplace=True)

# Reorder the columns
col_order = [
"sample",
"rank",
"reason",
"genome_size",
"species",
"runtype",
"original_runtype",
"mlst_scheme",
"mlst_st",
]
for col in final_df.columns:
if col not in col_order:
col_order.append(col)
final_df = final_df[col_order]

# Tab-delimited report
logging.info(f"Writing report: {txt_report}")
final_df.to_csv(txt_report, sep="\t", index=False)

# Exclusion report
logging.info(f"Writing exclusion report: {exclusion_report}")
cutoff_counts = defaultdict(int)
with open(exclusion_report, "w") as exclude_fh:
exclude_fh.write("sample\tstatus\treason\n")
for name, reason in CATEGORIES["failed"]:
if name in processed_samples:
reasons = reason.split(":")[1].split(";")
cutoffs = []
for r in reasons:
cutoffs.append(r.split("(")[0].strip().title())
cutoff_counts[";".join(sorted(cutoffs))] += 1
exclude_fh.write(f"{name}\texclude\t{reason}\n")
else:
exclude_fh.write(f"{name}\tqc-fail\t{reason}\n")

# Screen report
logging.info(f"Writing summary report: {summary_report}")
with open(summary_report, "w") as summary_fh:
summary_fh.write("Bactopia Summary Report\n")
summary_fh.write(
textwrap.dedent(
f"""
Total Samples: {COUNTS['total']}
Passed: {COUNTS["pass"]}
Gold: {COUNTS["gold"]}
Silver: {COUNTS["silver"]}
Bronze: {COUNTS["bronze"]}
Excluded: {COUNTS["total-excluded"]}
Failed Cutoff: {COUNTS["exclude"]}\n"""
if dfs:
final_df = pd.concat(dfs)
for col in EXCLUDE_COLUMNS:
if col in final_df.columns:
final_df.drop(col, axis=1, inplace=True)

# Reorder the columns
col_order = [
"sample",
"rank",
"reason",
"genome_size",
"species",
"runtype",
"original_runtype",
"mlst_scheme",
"mlst_st",
]
for col in final_df.columns:
if col not in col_order:
col_order.append(col)
final_df = final_df[col_order]

# Tab-delimited report
logging.info(f"Writing report: {txt_report}")
final_df.to_csv(txt_report, sep="\t", index=False)

# Exclusion report
logging.info(f"Writing exclusion report: {exclusion_report}")
cutoff_counts = defaultdict(int)
with open(exclusion_report, "w") as exclude_fh:
exclude_fh.write("sample\tstatus\treason\n")
for name, reason in CATEGORIES["failed"]:
if name in processed_samples:
reasons = reason.split(":")[1].split(";")
cutoffs = []
for r in reasons:
cutoffs.append(r.split("(")[0].strip().title())
cutoff_counts[";".join(sorted(cutoffs))] += 1
exclude_fh.write(f"{name}\texclude\t{reason}\n")
else:
exclude_fh.write(f"{name}\tqc-fail\t{reason}\n")

# Screen report
logging.info(f"Writing summary report: {summary_report}")
with open(summary_report, "w") as summary_fh:
summary_fh.write("Bactopia Summary Report\n")
summary_fh.write(
textwrap.dedent(
f"""
Total Samples: {COUNTS['total']}
Passed: {COUNTS["pass"]}
Gold: {COUNTS["gold"]}
Silver: {COUNTS["silver"]}
Bronze: {COUNTS["bronze"]}
Excluded: {COUNTS["total-excluded"]}
Failed Cutoff: {COUNTS["exclude"]}\n"""
)
)
)
summary_fh.write(f"{print_cutoffs(cutoff_counts)}\n")
summary_fh.write(f' QC Failure: {COUNTS["qc-failure"]}\n')
summary_fh.write(f"{print_failed(FAILED)}\n")
summary_fh.write(
textwrap.dedent(
f"""
Reports:
Full Report (txt): {txt_report}
Exclusion: {exclusion_report}
Summary: {summary_report}
Rank Cutoffs:
Gold:
Coverage >= {RANK_CUTOFF['gold']['coverage']}x
Quality >= Q{RANK_CUTOFF['gold']['quality']}
Read Length >= {RANK_CUTOFF['gold']['length']}bp
Total Contigs < {RANK_CUTOFF['gold']['contigs']}
Silver:
Coverage >= {RANK_CUTOFF['silver']['coverage']}x
Quality >= Q{RANK_CUTOFF['silver']['quality']}
Read Length >= {RANK_CUTOFF['silver']['length']}bp
Total Contigs < {RANK_CUTOFF['silver']['contigs']}
Bronze:
Coverage >= {RANK_CUTOFF['bronze']['coverage']}x
Quality >= Q{RANK_CUTOFF['bronze']['quality']}
Read Length >= {RANK_CUTOFF['bronze']['length']}bp
Total Contigs < {RANK_CUTOFF['bronze']['contigs']}
Assembly Length Exclusions:
Minimum: {RANK_CUTOFF['min-assembled-size']}
Maximum: {RANK_CUTOFF['min-assembled-size']}
"""
summary_fh.write(f"{print_cutoffs(cutoff_counts)}\n")
summary_fh.write(f' QC Failure: {COUNTS["qc-failure"]}\n')
summary_fh.write(f"{print_failed(FAILED)}\n")
summary_fh.write(
textwrap.dedent(
f"""
Reports:
Full Report (txt): {txt_report}
Exclusion: {exclusion_report}
Summary: {summary_report}
Rank Cutoffs:
Gold:
Coverage >= {RANK_CUTOFF['gold']['coverage']}x
Quality >= Q{RANK_CUTOFF['gold']['quality']}
Read Length >= {RANK_CUTOFF['gold']['length']}bp
Total Contigs < {RANK_CUTOFF['gold']['contigs']}
Silver:
Coverage >= {RANK_CUTOFF['silver']['coverage']}x
Quality >= Q{RANK_CUTOFF['silver']['quality']}
Read Length >= {RANK_CUTOFF['silver']['length']}bp
Total Contigs < {RANK_CUTOFF['silver']['contigs']}
Bronze:
Coverage >= {RANK_CUTOFF['bronze']['coverage']}x
Quality >= Q{RANK_CUTOFF['bronze']['quality']}
Read Length >= {RANK_CUTOFF['bronze']['length']}bp
Total Contigs < {RANK_CUTOFF['bronze']['contigs']}
Assembly Length Exclusions:
Minimum: {RANK_CUTOFF['min-assembled-size']}
Maximum: {RANK_CUTOFF['min-assembled-size']}
"""
)
)
)
else:
logging.warning("No samples found to process!")


def main():
Expand Down
9 changes: 4 additions & 5 deletions bactopia/parsers/parsables.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,9 @@ def get_parsable_files(path: str, name: str) -> list:
f"{path}/main/assembler/{name}.tsv": "assembler",
# gather
f"{path}/main/gather/{name}-meta.tsv": "gather",
# qc
f"{path}/main/qc/summary/{name}-final.json": "qc",
f"{path}/main/qc/summary/{name}-original.json": "qc",
# sketcher
f"{path}/main/sketcher/summary/{name}-mash-refseq88-k21.txt": "sketcher",
f"{path}/main/sketcher/summary/{name}-sourmash-gtdb-rs207-k31.txt": "sketcher",
f"{path}/main/sketcher/{name}-mash-refseq88-k21.txt": "sketcher",
f"{path}/main/sketcher/{name}-sourmash-gtdb-rs207-k31.txt": "sketcher",
# bactopia-tools
# amrfinderplus
f"{path}/tools/amrfinderplus/{name}-genes.tsv": "amrfinderplus",
Expand All @@ -53,6 +50,8 @@ def get_parsable_files(path: str, name: str) -> list:
missing_files.append(output_file)

if is_complete:
parsable_files[f"{path}/main/qc/summary/{name}-original.json"] = "qc"
parsable_files[f"{path}/main/qc/summary/{name}-final.json"] = "qc"
return [is_complete, parsable_files]
else:
return [is_complete, missing_files]
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bactopia"
version = "1.0.5"
version = "1.0.6"
description = "A Python package for working with Bactopia"
authors = [
"Robert A. Petit III <[email protected]>",
Expand Down

0 comments on commit 1c8d4a9

Please sign in to comment.