From ee1737ec07e1b4e12ac6f36e7d4e97a123b954b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20M=C3=B6lder?= <felix.moelder@uk-essen.de>
Date: Fri, 21 Feb 2025 09:40:30 +0100
Subject: [PATCH 1/4] feat: impact graphs

---
 config/config.yaml                  |  3 +
 workflow/Snakefile                  |  1 +
 workflow/envs/predictosaurus.yaml   |  5 ++
 workflow/rules/common.smk           | 23 +++++---
 workflow/rules/impact_graphs.smk    | 88 +++++++++++++++++++++++++++++
 workflow/rules/ref.smk              |  6 +-
 workflow/rules/table.smk            |  4 +-
 workflow/rules/testcase.smk         | 38 +++++++++++++
 workflow/schemas/config.schema.yaml |  8 +++
 9 files changed, 165 insertions(+), 11 deletions(-)
 create mode 100644 workflow/envs/predictosaurus.yaml
 create mode 100644 workflow/rules/impact_graphs.smk

diff --git a/config/config.yaml b/config/config.yaml
index 3368a5678..1fbe77ce6 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -77,6 +77,9 @@ mutational_signatures:
     # select events (callsets, defined under calling/fdr-control/events) to evaluate
     - some_id
 
+impact_graphs:
+  activate: false
+
 # Sets the minimum average coverage for each gene.
 # Genes with lower average coverage will not be concidered in gene coverage datavzrd report
 # If not present min_avg_coverage will be set to 0 rendering all genes.
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 7335685ea..dae2f5d3c 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -40,6 +40,7 @@ include: "rules/fusion_calling.smk"
 include: "rules/testcase.smk"
 include: "rules/population.smk"
 include: "rules/mutational_signatures.smk"
+include: "rules/impact_graphs.smk"
 
 
 batches = "all"
diff --git a/workflow/envs/predictosaurus.yaml b/workflow/envs/predictosaurus.yaml
new file mode 100644
index 000000000..8147dba59
--- /dev/null
+++ b/workflow/envs/predictosaurus.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - predictosaurus =0.2.10
\ No newline at end of file
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index a2dec48e6..9f5fc18a0 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -248,6 +248,7 @@ def get_final_output(wildcards):
                 )
     final_output.extend(get_mutational_burden_targets())
     final_output.extend(get_mutational_signature_targets())
+    final_output.extend(get_impact_graph_targets())
 
     if is_activated("population/db"):
         final_output.append(lookup(dpath="population/db/path", within=config))
@@ -683,17 +684,25 @@ def get_mutational_burden_targets():
 def get_mutational_signature_targets():
     mutational_signature_targets = []
     if is_activated("mutational_signatures"):
-        for group in variants_groups:
-            mutational_signature_targets.extend(
-                expand(
-                    "results/plots/mutational_signatures/{group}.{event}.html",
-                    group=variants_groups,
-                    event=config["mutational_signatures"].get("events"),
-                )
+        mutational_signature_targets.extend(
+            expand(
+                "results/plots/mutational_signatures/{group}.{event}.html",
+                group=variants_groups,
+                event=config["mutational_signatures"].get("events"),
             )
+        )
     return mutational_signature_targets
 
 
+def get_impact_graph_targets():
+    impact_graph_targets = []
+    if is_activated("impact_graphs"):
+        impact_graph_targets.extend(
+            expand("results/impact_graphs/{group}", group=variants_groups)
+        )
+    return impact_graph_targets
+
+
 def get_scattered_calls(ext="bcf"):
     def inner(wildcards):
         caller = "arriba" if wildcards.calling_type == "fusions" else variant_caller
diff --git a/workflow/rules/impact_graphs.smk b/workflow/rules/impact_graphs.smk
new file mode 100644
index 000000000..d9fe6d02c
--- /dev/null
+++ b/workflow/rules/impact_graphs.smk
@@ -0,0 +1,88 @@
+rule gather_calls:
+    input:
+        calls=gather.calling("results/calls/{{group}}.{{caller}}.{scatteritem}.bcf"),
+    output:
+        "results/calls/{group}/{caller}.bcf",
+    log:
+        "logs/gather-calls/{group}/{caller}.log",
+    params:
+        uncompressed_bcf=False,
+        extra="",
+    threads: 4
+    resources:
+        mem_mb=10,
+    wrapper:
+        "v5.8.2/bio/bcftools/concat"
+
+
+def get_predictosaurus_build_aux(wildcards):
+    aux = []
+    for sample in get_group_samples(wildcards.group):
+        aux.append(
+            "{alias}=results/observations/{group}/{sample}.{caller}.all.bcf".format(
+                alias=samples.loc[samples["sample_name"] == sample]["alias"].iloc[0],
+                caller=wildcards.caller,
+                group=wildcards.group,
+                sample=sample,
+            )
+        )
+    return " ".join(aux)
+
+
+def get_all_group_observations(wildcards):
+    return expand(
+        "results/observations/{group}/{sample}.{caller}.all.bcf",
+        caller=wildcards.caller,
+        group=wildcards.group,
+        sample=get_group_samples(wildcards.group),
+    )
+
+
+rule predictosaurus_build:
+    input:
+        calls="results/calls/{group}.{caller}.bcf",
+        obs=get_all_group_observations,
+    output:
+        "results/impact_graphs/{group}.{caller}.graphs.duckdb",
+    params:
+        obs_aux=get_predictosaurus_build_aux,
+    log:
+        "logs/predictosaurus/build/{group}_{caller}.log",
+    conda:
+        "../envs/predictosaurus.yaml"
+    shell:
+        """
+        predictosaurus build -v --calls {input.calls} --observations {params.obs_aux} --output {output} 2> {log}
+        """
+
+
+rule predictosaurus_process:
+    input:
+        ref=genome,
+        gff="resources/annotation.gff3",
+        graph="results/impact_graphs/{group}.freebayes.graphs.duckdb",
+    output:
+        "results/impact_graphs/{group}.paths.duckdb",
+    log:
+        "logs/predictosaurus/process/{group}.log",
+    conda:
+        "../envs/predictosaurus.yaml"
+    shell:
+        """
+        predictosaurus process -v --features {input.gff} --reference {input.ref} --graph {input.graph} --output {output} 2> {log}
+        """
+
+
+rule predictosaurus_plot:
+    input:
+        "results/impact_graphs/{group}.paths.duckdb",
+    output:
+        "results/impact_graphs/{group}",
+    log:
+        "logs/predictosaurus/plot/{group}.log",
+    conda:
+        "../envs/predictosaurus.yaml"
+    shell:
+        """
+        predictosaurus plot --input {input} --format html --output {output} 2> {log}
+        """
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index fb936e153..5bdbd25c6 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -61,14 +61,16 @@ rule get_known_variants:
 
 rule get_annotation:
     output:
-        "resources/annotation.gtf",
+        "resources/annotation.{ext}",
     params:
         species=config["ref"]["species"],
         release=config["ref"]["release"],
         build=config["ref"]["build"],
         flavor="",  # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
+    wildcard_constraints:
+        ext="gft|gff3",
     log:
-        "logs/get_annotation.log",
+        "logs/get_annotation_{ext}.log",
     cache: "omit-software"  # save space and time with between workflow caching (see docs)
     wrapper:
         "v2.3.2/bio/reference/ensembl-annotation"
diff --git a/workflow/rules/table.smk b/workflow/rules/table.smk
index 9b8028bbe..a850abe70 100644
--- a/workflow/rules/table.smk
+++ b/workflow/rules/table.smk
@@ -3,7 +3,7 @@ rule vembrane_table:
         bcf="results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.normal-probs.bcf",
         scenario="results/scenarios/{group}.yaml",
     output:
-        bcf="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv",
+        tsv="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv",
     conda:
         "../envs/vembrane.yaml"
     params:
@@ -12,7 +12,7 @@ rule vembrane_table:
         "logs/vembrane-table/{group}.{event}.{calling_type}.log",
     shell:
         'vembrane table --header "{params.config[header]}" "{params.config[expr]}" '
-        "{input.bcf} > {output.bcf} 2> {log}"
+        "{input.bcf} > {output.tsv} 2> {log}"
 
 
 rule tsv_to_excel:
diff --git a/workflow/rules/testcase.smk b/workflow/rules/testcase.smk
index c2ca67620..800ea1160 100644
--- a/workflow/rules/testcase.smk
+++ b/workflow/rules/testcase.smk
@@ -1,3 +1,41 @@
+rule sort_observations:
+    input:
+        "results/observations/{group}/{sample}.{caller}.{scatteritem}.bcf",
+    output:
+        temp("results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf"),
+    params:
+        # Set to True, in case you want uncompressed BCF output
+        uncompressed_bcf=False,
+        # Extra arguments
+        extras="",
+    log:
+        "logs/bcf-sort/{group}/{sample}.{caller}.{scatteritem}.log",
+    resources:
+        mem_mb=8000,
+    wrapper:
+        "v5.5.0/bio/bcftools/sort"
+
+
+rule index_observations:
+    input:
+        "results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf",
+    output:
+        temp(
+            "results/observations/{group}/{sample}.{caller}.{scatteritem}.sorted.bcf.csi"
+        ),
+    params:
+        # Set to True, in case you want uncompressed BCF output
+        uncompressed_bcf=False,
+        # Extra arguments
+        extras="",
+    log:
+        "logs/bcf-index/{group}/{sample}.{caller}.{scatteritem}.log",
+    resources:
+        mem_mb=8000,
+    wrapper:
+        "v5.5.0/bio/bcftools/index"
+
+
 rule gather_observations:
     input:
         calls=gather.calling(
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 0e162f24d..82802a294 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -193,6 +193,14 @@ properties:
         type: boolean
     required:
       - activate
+  
+  impact_graphs:
+    type: object
+    properties:
+      activate:
+        type: boolean
+    required:
+      - activate
 
 
   calling:

From 7472d1c648f56611049fb90c798bc76755345bd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20M=C3=B6lder?= <felix.moelder@uk-essen.de>
Date: Fri, 4 Jul 2025 10:59:51 +0200
Subject: [PATCH 2/4] fix: multiple minor fixes

---
 workflow/envs/arriba.post-deploy.sh   | 4 ----
 workflow/envs/arriba.yaml             | 2 +-
 workflow/rules/common.smk             | 2 +-
 workflow/rules/fusion_calling.smk     | 2 +-
 workflow/scripts/split-call-tables.py | 1 -
 5 files changed, 3 insertions(+), 8 deletions(-)
 delete mode 100644 workflow/envs/arriba.post-deploy.sh

diff --git a/workflow/envs/arriba.post-deploy.sh b/workflow/envs/arriba.post-deploy.sh
deleted file mode 100644
index 8a72a7660..000000000
--- a/workflow/envs/arriba.post-deploy.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!env bash
-
-# Replace convert_fusions_to_vcf.sh script by updated one until new version of arriba is released
-curl https://raw.githubusercontent.com/suhrig/arriba/6fff196/scripts/convert_fusions_to_vcf.sh > $CONDA_PREFIX/bin/convert_fusions_to_vcf.sh
diff --git a/workflow/envs/arriba.yaml b/workflow/envs/arriba.yaml
index 861318b7d..87079487a 100644
--- a/workflow/envs/arriba.yaml
+++ b/workflow/envs/arriba.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - arriba =2.4
\ No newline at end of file
+  - arriba =2.5
\ No newline at end of file
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index f9dedb4e7..d3580eef2 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -11,7 +11,7 @@ samples = (
     pd.read_csv(
         config["samples"],
         sep="\t",
-        dtype={"sample_name": str, "group": str},
+        dtype={"sample_name": str, "group": str, "umi_len": "Int64"},
         comment="#",
     )
     .set_index("sample_name", drop=False)
diff --git a/workflow/rules/fusion_calling.smk b/workflow/rules/fusion_calling.smk
index 5a4a66ad5..216fc95c6 100644
--- a/workflow/rules/fusion_calling.smk
+++ b/workflow/rules/fusion_calling.smk
@@ -1,6 +1,6 @@
 module fusion_calling:
     meta_wrapper:
-        "v3.13.3/meta/bio/star_arriba"
+        "v7.1.0/meta/bio/star_arriba"
     config:
         config
 
diff --git a/workflow/scripts/split-call-tables.py b/workflow/scripts/split-call-tables.py
index 9762db108..c6df1cb78 100644
--- a/workflow/scripts/split-call-tables.py
+++ b/workflow/scripts/split-call-tables.py
@@ -173,7 +173,6 @@ def variants(
             self.pos = pos
             self._variants = self._load_variants()
         for variant in self._variants:
-            print(variant.pos)
             if variant.pos == pos and variant.alts[0] == alt:
                 yield variant
             if variant.pos > pos:

From f567489c6f81eb9643da2d3c89f13ea1d775f403 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20M=C3=B6lder?= <felix.moelder@uk-essen.de>
Date: Mon, 21 Jul 2025 08:35:30 +0200
Subject: [PATCH 3/4] remove print cmd

---
 workflow/scripts/split-call-tables.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflow/scripts/split-call-tables.py b/workflow/scripts/split-call-tables.py
index c6df1cb78..9a249c261 100644
--- a/workflow/scripts/split-call-tables.py
+++ b/workflow/scripts/split-call-tables.py
@@ -195,7 +195,6 @@ def annotate_row(self, row):
         )
 
     def _load_variants(self):
-        print(self.pos)
         return self.bcf.fetch(str(self.contig), self.pos - 1, self.end)
 
     @property

From c9369b765c4730324b9de75d8204561b867851e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20M=C3=B6lder?= <felix.moelder@uk-essen.de>
Date: Wed, 6 Aug 2025 15:12:53 +0200
Subject: [PATCH 4/4] Apply suggestions from code review

Co-authored-by: Felix Wiegand <fxwiegand@wgdnet.de>
---
 workflow/envs/predictosaurus.yaml | 2 +-
 workflow/rules/impact_graphs.smk  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/envs/predictosaurus.yaml b/workflow/envs/predictosaurus.yaml
index 8147dba59..986ceb09f 100644
--- a/workflow/envs/predictosaurus.yaml
+++ b/workflow/envs/predictosaurus.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - predictosaurus =0.2.10
\ No newline at end of file
+  - predictosaurus =0.4.0
\ No newline at end of file
diff --git a/workflow/rules/impact_graphs.smk b/workflow/rules/impact_graphs.smk
index d9fe6d02c..6adbfdd58 100644
--- a/workflow/rules/impact_graphs.smk
+++ b/workflow/rules/impact_graphs.smk
@@ -62,7 +62,7 @@ rule predictosaurus_process:
         gff="resources/annotation.gff3",
         graph="results/impact_graphs/{group}.freebayes.graphs.duckdb",
     output:
-        "results/impact_graphs/{group}.paths.duckdb",
+        "results/impact_graphs/{group}.scores.duckdb",
     log:
         "logs/predictosaurus/process/{group}.log",
     conda:
@@ -84,5 +84,5 @@ rule predictosaurus_plot:
         "../envs/predictosaurus.yaml"
     shell:
         """
-        predictosaurus plot --input {input} --format html --output {output} 2> {log}
+        predictosaurus plot --input {input} --output {output} 2> {log}
         """