snakemake-workflows · FelixMoelder · Feb 21, 2025 · Jul 4, 2025 · Jul 21, 2025 · Aug 6, 2025
diff --git a/config/config.yaml b/config/config.yaml
@@ -77,6 +77,9 @@ mutational_signatures:
     # select events (callsets, defined under calling/fdr-control/events) to evaluate
     - some_id
 
+impact_graphs:
+  activate: false
+
 # Sets the minimum average coverage for each gene.
 # Genes with lower average coverage will not be concidered in gene coverage datavzrd report
 # If not present min_avg_coverage will be set to 0 rendering all genes.

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -44,6 +44,7 @@ include: "rules/fusion_calling.smk"
 include: "rules/testcase.smk"
 include: "rules/population.smk"
 include: "rules/mutational_signatures.smk"
+include: "rules/impact_graphs.smk"
 
 
 batches = "all"

diff --git a/workflow/envs/arriba.post-deploy.sh b/workflow/envs/arriba.post-deploy.sh
diff --git a/workflow/envs/arriba.yaml b/workflow/envs/arriba.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - arriba =2.4
+  - arriba =2.5
diff --git a/workflow/envs/predictosaurus.yaml b/workflow/envs/predictosaurus.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - predictosaurus =0.4.0
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -11,7 +11,7 @@ samples = (
     pd.read_csv(
         config["samples"],
         sep="\t",
-        dtype={"sample_name": str, "group": str},
+        dtype={"sample_name": str, "group": str, "umi_len": "Int64"},
         comment="#",
     )
     .set_index("sample_name", drop=False)
@@ -248,6 +248,7 @@ def get_final_output(wildcards):
                 )
     final_output.extend(get_mutational_burden_targets())
     final_output.extend(get_mutational_signature_targets())
+    final_output.extend(get_impact_graph_targets())
 
     if is_activated("population/db"):
         final_output.append(lookup(dpath="population/db/path", within=config))
@@ -689,17 +690,25 @@ def get_mutational_burden_targets():
 def get_mutational_signature_targets():
     mutational_signature_targets = []
     if is_activated("mutational_signatures"):
-        for group in variants_groups:
-            mutational_signature_targets.extend(
-                expand(
-                    "results/plots/mutational_signatures/{group}.{event}.html",
-                    group=variants_groups,
-                    event=config["mutational_signatures"].get("events"),
-                )
+        mutational_signature_targets.extend(
+            expand(
+                "results/plots/mutational_signatures/{group}.{event}.html",
+                group=variants_groups,
+                event=config["mutational_signatures"].get("events"),
             )
+        )
     return mutational_signature_targets
 
 
+def get_impact_graph_targets():
+    impact_graph_targets = []
+    if is_activated("impact_graphs"):
+        impact_graph_targets.extend(
+            expand("results/impact_graphs/{group}", group=variants_groups)
+        )
+    return impact_graph_targets
+
+
 def get_scattered_calls(ext="bcf"):
     def inner(wildcards):
         caller = "arriba" if wildcards.calling_type == "fusions" else variant_caller

diff --git a/workflow/rules/fusion_calling.smk b/workflow/rules/fusion_calling.smk
@@ -1,6 +1,6 @@
 module fusion_calling:
     meta_wrapper:
-        "v3.13.3/meta/bio/star_arriba"
+        "v7.1.0/meta/bio/star_arriba"
     config:
         config
 

diff --git a/workflow/rules/impact_graphs.smk b/workflow/rules/impact_graphs.smk
@@ -0,0 +1,88 @@
+rule gather_calls:
+    input:
+        calls=gather.calling("results/calls/{{group}}.{{caller}}.{scatteritem}.bcf"),
+    output:
+        "results/calls/{group}/{caller}.bcf",
+    log:
+        "logs/gather-calls/{group}/{caller}.log",
+    params:
+        uncompressed_bcf=False,
+        extra="",
+    threads: 4
+    resources:
+        mem_mb=10,
+    wrapper:
+        "v5.8.2/bio/bcftools/concat"
+
+
+def get_predictosaurus_build_aux(wildcards):
+    aux = []
+    for sample in get_group_samples(wildcards.group):
+        aux.append(
+            "{alias}=results/observations/{group}/{sample}.{caller}.all.bcf".format(
+                alias=samples.loc[samples["sample_name"] == sample]["alias"].iloc[0],
+                caller=wildcards.caller,
+                group=wildcards.group,
+                sample=sample,
+            )
+        )
+    return " ".join(aux)
+
+
+def get_all_group_observations(wildcards):
+    return expand(
+        "results/observations/{group}/{sample}.{caller}.all.bcf",
+        caller=wildcards.caller,
+        group=wildcards.group,
+        sample=get_group_samples(wildcards.group),
+    )
+
+
+rule predictosaurus_build:
+    input:
+        calls="results/calls/{group}.{caller}.bcf",
+        obs=get_all_group_observations,
+    output:
+        "results/impact_graphs/{group}.{caller}.graphs.duckdb",
+    params:
+        obs_aux=get_predictosaurus_build_aux,
+    log:
+        "logs/predictosaurus/build/{group}_{caller}.log",
+    conda:
+        "../envs/predictosaurus.yaml"
+    shell:
+        """
+        predictosaurus build -v --calls {input.calls} --observations {params.obs_aux} --output {output} 2> {log}
+        """
+
+
+rule predictosaurus_process:
+    input:
+        ref=genome,
+        gff="resources/annotation.gff3",
+        graph="results/impact_graphs/{group}.freebayes.graphs.duckdb",
+    output:
+        "results/impact_graphs/{group}.scores.duckdb",
+    log:
+        "logs/predictosaurus/process/{group}.log",
+    conda:
+        "../envs/predictosaurus.yaml"
+    shell:
+        """
+        predictosaurus process -v --features {input.gff} --reference {input.ref} --graph {input.graph} --output {output} 2> {log}
+        """
+
+
+rule predictosaurus_plot:
+    input:
+        "results/impact_graphs/{group}.paths.duckdb",
+    output:
+        "results/impact_graphs/{group}",
+    log:
+        "logs/predictosaurus/plot/{group}.log",
+    conda:
+        "../envs/predictosaurus.yaml"
+    shell:
+        """
+        predictosaurus plot --input {input} --output {output} 2> {log}
+        """
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
@@ -61,14 +61,16 @@ rule get_known_variants:
 
 rule get_annotation:
     output:
-        "resources/annotation.gtf",
+        "resources/annotation.{ext}",
     params:
         species=config["ref"]["species"],
         release=config["ref"]["release"],
         build=config["ref"]["build"],
         flavor="",  # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
+    wildcard_constraints:
+        ext="gft|gff3",
     log:
-        "logs/get_annotation.log",
+        "logs/get_annotation_{ext}.log",
     cache: "omit-software"  # save space and time with between workflow caching (see docs)
     wrapper:
         "v2.3.2/bio/reference/ensembl-annotation"

diff --git a/workflow/rules/table.smk b/workflow/rules/table.smk
@@ -3,7 +3,7 @@ rule vembrane_table:
         bcf="results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.normal-probs.bcf",
         scenario="results/scenarios/{group}.yaml",
     output:
-        bcf="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv",
+        tsv="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv",
     conda:
         "../envs/vembrane.yaml"
     params:
@@ -12,7 +12,7 @@ rule vembrane_table:
         "logs/vembrane-table/{group}.{event}.{calling_type}.log",
     shell:
         'vembrane table --header "{params.config[header]}" "{params.config[expr]}" '
-        "{input.bcf} > {output.bcf} 2> {log}"
+        "{input.bcf} > {output.tsv} 2> {log}"
 
 
 rule tsv_to_excel:

diff --git a/workflow/rules/testcase.smk b/workflow/rules/testcase.smk
@@ -1,3 +1,41 @@
+rule sort_observations:
+    input:
+        "results/observations/{group}/{sample}.{caller}.{scatteritem}.bcf",
+    output:
+        temp("results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf"),
+    params:
+        # Set to True, in case you want uncompressed BCF output
+        uncompressed_bcf=False,
+        # Extra arguments
+        extras="",
+    log:
+        "logs/bcf-sort/{group}/{sample}.{caller}.{scatteritem}.log",
+    resources:
+        mem_mb=8000,
+    wrapper:
+        "v5.5.0/bio/bcftools/sort"
+
+
+rule index_observations:
+    input:
+        "results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf",
+    output:
+        temp(
+            "results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf.csi"
+        ),
+    params:
+        # Set to True, in case you want uncompressed BCF output
+        uncompressed_bcf=False,
+        # Extra arguments
+        extras="",
+    log:
+        "logs/bcf-index/{group}/{sample}.{caller}.{scatteritem}.log",
+    resources:
+        mem_mb=8000,
+    wrapper:
+        "v5.5.0/bio/bcftools/index"
+
+
 rule gather_observations:
     input:
         calls=gather.calling(

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -193,6 +193,14 @@ properties:
         type: boolean
     required:
       - activate
+
+  impact_graphs:
+    type: object
+    properties:
+      activate:
+        type: boolean
+    required:
+      - activate
 
 
   calling:

diff --git a/workflow/scripts/split-call-tables.py b/workflow/scripts/split-call-tables.py
@@ -173,7 +173,6 @@ def variants(
             self.pos = pos
             self._variants = self._load_variants()
         for variant in self._variants:
-            print(variant.pos)
             if variant.pos == pos and variant.alts[0] == alt:
                 yield variant
             if variant.pos > pos:
@@ -196,7 +195,6 @@ def annotate_row(self, row):
         )
 
     def _load_variants(self):
-        print(self.pos)
         return self.bcf.fetch(str(self.contig), self.pos - 1, self.end)
 
     @property