From ee1737ec07e1b4e12ac6f36e7d4e97a123b954b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Fri, 21 Feb 2025 09:40:30 +0100 Subject: [PATCH 1/4] feat: impact graphs --- config/config.yaml | 3 + workflow/Snakefile | 1 + workflow/envs/predictosaurus.yaml | 5 ++ workflow/rules/common.smk | 23 +++++--- workflow/rules/impact_graphs.smk | 88 +++++++++++++++++++++++++++++ workflow/rules/ref.smk | 6 +- workflow/rules/table.smk | 4 +- workflow/rules/testcase.smk | 38 +++++++++++++ workflow/schemas/config.schema.yaml | 8 +++ 9 files changed, 165 insertions(+), 11 deletions(-) create mode 100644 workflow/envs/predictosaurus.yaml create mode 100644 workflow/rules/impact_graphs.smk diff --git a/config/config.yaml b/config/config.yaml index 3368a5678..1fbe77ce6 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -77,6 +77,9 @@ mutational_signatures: # select events (callsets, defined under calling/fdr-control/events) to evaluate - some_id +impact_graphs: + activate: false + # Sets the minimum average coverage for each gene. # Genes with lower average coverage will not be concidered in gene coverage datavzrd report # If not present min_avg_coverage will be set to 0 rendering all genes. diff --git a/workflow/Snakefile b/workflow/Snakefile index 7335685ea..dae2f5d3c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -40,6 +40,7 @@ include: "rules/fusion_calling.smk" include: "rules/testcase.smk" include: "rules/population.smk" include: "rules/mutational_signatures.smk" +include: "rules/impact_graphs.smk" batches = "all" diff --git a/workflow/envs/predictosaurus.yaml b/workflow/envs/predictosaurus.yaml new file mode 100644 index 000000000..8147dba59 --- /dev/null +++ b/workflow/envs/predictosaurus.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - predictosaurus =0.2.10 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a2dec48e6..9f5fc18a0 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -248,6 +248,7 @@ def get_final_output(wildcards): ) final_output.extend(get_mutational_burden_targets()) final_output.extend(get_mutational_signature_targets()) + final_output.extend(get_impact_graph_targets()) if is_activated("population/db"): final_output.append(lookup(dpath="population/db/path", within=config)) @@ -683,17 +684,25 @@ def get_mutational_burden_targets(): def get_mutational_signature_targets(): mutational_signature_targets = [] if is_activated("mutational_signatures"): - for group in variants_groups: - mutational_signature_targets.extend( - expand( - "results/plots/mutational_signatures/{group}.{event}.html", - group=variants_groups, - event=config["mutational_signatures"].get("events"), - ) + mutational_signature_targets.extend( + expand( + "results/plots/mutational_signatures/{group}.{event}.html", + group=variants_groups, + event=config["mutational_signatures"].get("events"), ) + ) return mutational_signature_targets +def get_impact_graph_targets(): + impact_graph_targets = [] + if is_activated("impact_graphs"): + impact_graph_targets.extend( + expand("results/impact_graphs/{group}", group=variants_groups) + ) + return impact_graph_targets + + def get_scattered_calls(ext="bcf"): def inner(wildcards): caller = "arriba" if wildcards.calling_type == "fusions" else variant_caller diff --git a/workflow/rules/impact_graphs.smk b/workflow/rules/impact_graphs.smk new file mode 100644 index 000000000..d9fe6d02c --- /dev/null +++ b/workflow/rules/impact_graphs.smk @@ -0,0 +1,88 @@ +rule gather_calls: + input: + calls=gather.calling("results/calls/{{group}}.{{caller}}.{scatteritem}.bcf"), + output: + "results/calls/{group}/{caller}.bcf", + log: + "logs/gather-calls/{group}/{caller}.log", + params: + uncompressed_bcf=False, + extra="", + threads: 4 + resources: + mem_mb=10, + wrapper: + "v5.8.2/bio/bcftools/concat" + + +def get_predictosaurus_build_aux(wildcards): + aux = [] + for sample in get_group_samples(wildcards.group): + aux.append( + "{alias}=results/observations/{group}/{sample}.{caller}.all.bcf".format( + alias=samples.loc[samples["sample_name"] == sample]["alias"].iloc[0], + caller=wildcards.caller, + group=wildcards.group, + sample=sample, + ) + ) + return " ".join(aux) + + +def get_all_group_observations(wildcards): + return expand( + "results/observations/{group}/{sample}.{caller}.all.bcf", + caller=wildcards.caller, + group=wildcards.group, + sample=get_group_samples(wildcards.group), + ) + + +rule predictosaurus_build: + input: + calls="results/calls/{group}.{caller}.bcf", + obs=get_all_group_observations, + output: + "results/impact_graphs/{group}.{caller}.graphs.duckdb", + params: + obs_aux=get_predictosaurus_build_aux, + log: + "logs/predictosaurus/build/{group}_{caller}.log", + conda: + "../envs/predictosaurus.yaml" + shell: + """ + predictosaurus build -v --calls {input.calls} --observations {params.obs_aux} --output {output} 2> {log} + """ + + +rule predictosaurus_process: + input: + ref=genome, + gff="resources/annotation.gff3", + graph="results/impact_graphs/{group}.freebayes.graphs.duckdb", + output: + "results/impact_graphs/{group}.paths.duckdb", + log: + "logs/predictosaurus/process/{group}.log", + conda: + "../envs/predictosaurus.yaml" + shell: + """ + predictosaurus process -v --features {input.gff} --reference {input.ref} --graph {input.graph} --output {output} 2> {log} + """ + + +rule predictosaurus_plot: + input: + "results/impact_graphs/{group}.paths.duckdb", + output: + "results/impact_graphs/{group}", + log: + "logs/predictosaurus/plot/{group}.log", + conda: + "../envs/predictosaurus.yaml" + shell: + """ + predictosaurus plot --input {input} --format html --output {output} 2> {log} + """ diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index fb936e153..5bdbd25c6 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -61,14 +61,16 @@ rule get_known_variants: rule get_annotation: output: - "resources/annotation.gtf", + "resources/annotation.{ext}", params: species=config["ref"]["species"], release=config["ref"]["release"], build=config["ref"]["build"], flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. + wildcard_constraints: + ext="gft|gff3", log: - "logs/get_annotation.log", + "logs/get_annotation_{ext}.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "v2.3.2/bio/reference/ensembl-annotation" diff --git a/workflow/rules/table.smk b/workflow/rules/table.smk index 9b8028bbe..a850abe70 100644 --- a/workflow/rules/table.smk +++ b/workflow/rules/table.smk @@ -3,7 +3,7 @@ rule vembrane_table: bcf="results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.normal-probs.bcf", scenario="results/scenarios/{group}.yaml", output: - bcf="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv", + tsv="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv", conda: "../envs/vembrane.yaml" params: @@ -12,7 +12,7 @@ rule vembrane_table: "logs/vembrane-table/{group}.{event}.{calling_type}.log", shell: 'vembrane table --header "{params.config[header]}" "{params.config[expr]}" ' - "{input.bcf} > {output.bcf} 2> {log}" + "{input.bcf} > {output.tsv} 2> {log}" rule tsv_to_excel: diff --git a/workflow/rules/testcase.smk b/workflow/rules/testcase.smk index c2ca67620..800ea1160 100644 --- a/workflow/rules/testcase.smk +++ b/workflow/rules/testcase.smk @@ -1,3 +1,41 @@ +rule sort_observations: + input: + "results/observations/{group}/{sample}.{caller}.{scatteritem}.bcf", + output: + temp("results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf"), + params: + # Set to True, in case you want uncompressed BCF output + uncompressed_bcf=False, + # Extra arguments + extras="", + log: + "logs/bcf-sort/{group}/{sample}.{caller}.{scatteritem}.log", + resources: + mem_mb=8000, + wrapper: + "v5.5.0/bio/bcftools/sort" + + +rule index_observations: + input: + "results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf", + output: + temp( + "results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf.csi" + ), + params: + # Set to True, in case you want uncompressed BCF output + uncompressed_bcf=False, + # Extra arguments + extras="", + log: + "logs/bcf-index/{group}/{sample}.{caller}.{scatteritem}.log", + resources: + mem_mb=8000, + wrapper: + "v5.5.0/bio/bcftools/index" + + rule gather_observations: input: calls=gather.calling( diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 0e162f24d..82802a294 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -193,6 +193,14 @@ properties: type: boolean required: - activate + + impact_graphs: + type: object + properties: + activate: + type: boolean + required: + - activate calling: From 7472d1c648f56611049fb90c798bc76755345bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Fri, 4 Jul 2025 10:59:51 +0200 Subject: [PATCH 2/4] fix: multiple minor fixes --- workflow/envs/arriba.post-deploy.sh | 4 ---- workflow/envs/arriba.yaml | 2 +- workflow/rules/common.smk | 2 +- workflow/rules/fusion_calling.smk | 2 +- workflow/scripts/split-call-tables.py | 1 - 5 files changed, 3 insertions(+), 8 deletions(-) delete mode 100644 workflow/envs/arriba.post-deploy.sh diff --git a/workflow/envs/arriba.post-deploy.sh b/workflow/envs/arriba.post-deploy.sh deleted file mode 100644 index 8a72a7660..000000000 --- a/workflow/envs/arriba.post-deploy.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!env bash - -# Replace convert_fusions_to_vcf.sh script by updated one until new version of arriba is released -curl https://raw.githubusercontent.com/suhrig/arriba/6fff196/scripts/convert_fusions_to_vcf.sh > $CONDA_PREFIX/bin/convert_fusions_to_vcf.sh diff --git a/workflow/envs/arriba.yaml b/workflow/envs/arriba.yaml index 861318b7d..87079487a 100644 --- a/workflow/envs/arriba.yaml +++ b/workflow/envs/arriba.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - arriba =2.4 \ No newline at end of file + - arriba =2.5 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f9dedb4e7..d3580eef2 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -11,7 +11,7 @@ samples = ( pd.read_csv( config["samples"], sep="\t", - dtype={"sample_name": str, "group": str}, + dtype={"sample_name": str, "group": str, "umi_len": "Int64"}, comment="#", ) .set_index("sample_name", drop=False) diff --git a/workflow/rules/fusion_calling.smk b/workflow/rules/fusion_calling.smk index 5a4a66ad5..216fc95c6 100644 --- a/workflow/rules/fusion_calling.smk +++ b/workflow/rules/fusion_calling.smk @@ -1,6 +1,6 @@ module fusion_calling: meta_wrapper: - "v3.13.3/meta/bio/star_arriba" + "v7.1.0/meta/bio/star_arriba" config: config diff --git a/workflow/scripts/split-call-tables.py b/workflow/scripts/split-call-tables.py index 9762db108..c6df1cb78 100644 --- a/workflow/scripts/split-call-tables.py +++ b/workflow/scripts/split-call-tables.py @@ -173,7 +173,6 @@ def variants( self.pos = pos self._variants = self._load_variants() for variant in self._variants: - print(variant.pos) if variant.pos == pos and variant.alts[0] == alt: yield variant if variant.pos > pos: From f567489c6f81eb9643da2d3c89f13ea1d775f403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Mon, 21 Jul 2025 08:35:30 +0200 Subject: [PATCH 3/4] remove print cmd --- workflow/scripts/split-call-tables.py | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/scripts/split-call-tables.py b/workflow/scripts/split-call-tables.py index c6df1cb78..9a249c261 100644 --- a/workflow/scripts/split-call-tables.py +++ b/workflow/scripts/split-call-tables.py @@ -195,7 +195,6 @@ def annotate_row(self, row): ) def _load_variants(self): - print(self.pos) return self.bcf.fetch(str(self.contig), self.pos - 1, self.end) @property From c9369b765c4730324b9de75d8204561b867851e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Wed, 6 Aug 2025 15:12:53 +0200 Subject: [PATCH 4/4] Apply suggestions from code review Co-authored-by: Felix Wiegand --- workflow/envs/predictosaurus.yaml | 2 +- workflow/rules/impact_graphs.smk | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/envs/predictosaurus.yaml b/workflow/envs/predictosaurus.yaml index 8147dba59..986ceb09f 100644 --- a/workflow/envs/predictosaurus.yaml +++ b/workflow/envs/predictosaurus.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - predictosaurus =0.2.10 \ No newline at end of file + - predictosaurus =0.4.0 \ No newline at end of file diff --git a/workflow/rules/impact_graphs.smk b/workflow/rules/impact_graphs.smk index d9fe6d02c..6adbfdd58 100644 --- a/workflow/rules/impact_graphs.smk +++ b/workflow/rules/impact_graphs.smk @@ -62,7 +62,7 @@ rule predictosaurus_process: gff="resources/annotation.gff3", graph="results/impact_graphs/{group}.freebayes.graphs.duckdb", output: - "results/impact_graphs/{group}.paths.duckdb", + "results/impact_graphs/{group}.scores.duckdb", log: "logs/predictosaurus/process/{group}.log", conda: @@ -84,5 +84,5 @@ rule predictosaurus_plot: "../envs/predictosaurus.yaml" shell: """ - predictosaurus plot --input {input} --format html --output {output} 2> {log} + predictosaurus plot --input {input} --output {output} 2> {log} """