Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Handle missing bam columns in units.tsv #105

Merged
merged 15 commits into from
Aug 5, 2024
2 changes: 1 addition & 1 deletion .test/config/units.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
sample unit fragment_len_mean fragment_len_sd fq1 fq2 bam_single bam_paired
sample unit fragment_len_mean fragment_len_sd fq1 fq2
A 1 ngs-test-data/reads/a.chr21.1.fq ngs-test-data/reads/a.chr21.2.fq
B 1 ngs-test-data/reads/b.chr21.1.fq ngs-test-data/reads/b.chr21.2.fq
B 2 300 14 ngs-test-data/reads/b.chr21.1.fq
Expand Down
30 changes: 22 additions & 8 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,35 @@ def get_model(wildcards):
return config["diffexp"]["models"][wildcards.model]


def column_missing_or_empty(column_name, dataframe, sample, unit):
    """Return True if a column is absent or holds no value for a given unit.

    Args:
        column_name: Name of the column to check.
        dataframe: Table whose rows are looked up by ``(sample, unit)``.
        sample: Sample identifier, first part of the row key.
        unit: Unit identifier, second part of the row key.

    Returns:
        True when ``column_name`` is not among ``dataframe.columns`` or when
        ``pd.isnull`` reports the cell at ``(sample, unit)`` as null;
        otherwise the (false) result of ``pd.isnull`` for that cell.
    """
    # Optional columns may be left out of the sheet entirely; treat that the
    # same as an empty cell instead of letting the lookup raise KeyError.
    if column_name not in dataframe.columns:
        return True
    return pd.isnull(dataframe.loc[(sample, unit), column_name])


def is_single_end(sample, unit):
    """Determine whether unit is single-end.

    A unit counts as single-end when it has neither an ``fq2`` entry nor a
    ``bam_paired`` entry in ``units``.

    Args:
        sample: Sample identifier used to look up the row in ``units``.
        unit: Unit identifier used to look up the row in ``units``.

    Returns:
        Truthy when both ``fq2`` and ``bam_paired`` are missing or empty for
        this unit, falsy otherwise.
    """
    # Go through column_missing_or_empty rather than indexing units directly:
    # a direct units.loc[..., "bam_paired"] lookup raises KeyError when that
    # column is not present in the sheet at all.
    return column_missing_or_empty(
        "fq2", units, sample, unit
    ) and column_missing_or_empty("bam_paired", units, sample, unit)


def get_fastqs(wildcards):
"""Get raw FASTQ files from unit sheet."""
if not pd.isnull(units.loc[(wildcards.sample, wildcards.unit), "bam_single"]):
if not column_missing_or_empty(
"bam_single", units, wildcards.sample, wildcards.unit
):
return f"results/fastq/{wildcards.sample}-{wildcards.unit}.fq.gz"
elif not pd.isnull(units.loc[(wildcards.sample, wildcards.unit), "bam_paired"]):
fqfrombam1 = f"results/fastq/{wildcards.sample}-{wildcards.unit}.1.fq.gz"
fqfrombam2 = f"results/fastq/{wildcards.sample}-{wildcards.unit}.2.fq.gz"
return [fqfrombam1, fqfrombam2]
elif not column_missing_or_empty(
"bam_paired", units, wildcards.sample, wildcards.unit
):
return expand(
"results/fastq/{sample}-{unit}.{read}.fq.gz",
sample=wildcards.sample,
unit=wildcards.unit,
read=["1", "2"],
)
elif is_single_end(wildcards.sample, wildcards.unit):
return units.loc[(wildcards.sample, wildcards.unit), "fq1"]
else:
Expand Down
Loading