nf-core · adamrtalbot · Nov 2, 2022 · Nov 2, 2022 · Nov 2, 2022 · Nov 2, 2022
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,4 @@
-sample,fastq_1,fastq_2,read_structure
+sample,fastq_1,fastq_2,read_structure,fastq_umi
 SAMPLE_DUPLEX_SEQ,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,10M1S+T 10M1S+T
 SAMPLE_SINGLE_UMI,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz,12M+T +T
+SAMPLE_UMI_FASTQ,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R3_001.fastq.gz,+T +T +M,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -73,7 +73,9 @@ def validate_and_transform(self, row):
         self._validate_first(row)
         self._validate_second(row)
         self._validate_pair(row)
-        self._seen.add((row[self._sample_col], row[self._first_col], row[self._second_col]))
+        self._seen.add(
+            (row[self._sample_col], row[self._first_col], row[self._second_col])
+        )
         self.modified.append(row)
 
     def _validate_sample(self, row):
@@ -95,14 +97,14 @@ def _validate_second(self, row):
     def _validate_pair(self, row):
         """Assert that read pairs have the same file extension. Report pair status."""
         assert (
-            Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
+            Path(row[self._first_col]).suffixes[-2:]
+            == Path(row[self._second_col]).suffixes[-2:]
         ), "FASTQ pairs must have the same file extensions."
 
     def _validate_read_structure(self, row):
         """Assert that the second FASTQ entry has the right format if it exists."""
-        assert len(row[self._read_structure_col].split(' ')) == 2, (
-            "Two read structures must be provided."
-        )
+        n_structures = len(row[self._read_structure_col].split(" "))
+        assert 2 <= n_structures <= 3, "Two read structures must be provided."
 
     def _validate_fastq_format(self, filename):
         """Assert that a given filename has one of the expected FASTQ extensions."""
@@ -119,7 +121,9 @@ def validate_unique_samples(self):
         FASTQ file combination exists.
 
         """
-        assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
+        assert len(self._seen) == len(
+            self.modified
+        ), "The pair of sample name and FASTQ must be unique."
         if len({pair[0] for pair in self._seen}) < len(self._seen):
             counts = Counter(pair[0] for pair in self._seen)
             seen = Counter()
@@ -192,7 +196,9 @@ def check_samplesheet(file_in, file_out):
         reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
         # Validate the existence of the expected header columns.
         if not required_columns.issubset(reader.fieldnames):
-            logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.")
+            logger.critical(
+                f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}."
+            )
             sys.exit(1)
         # Validate each row.
         checker = RowChecker()

diff --git a/modules/local/align_bam/main.nf b/modules/local/align_bam/main.nf
@@ -52,7 +52,7 @@ process ALIGN_BAM {
 
     """
     # The real path to the FASTA
-    FASTA=`find -L ./ -name "*.amb" | sed 's/.amb//'`
+    FASTA=`find -L ./ -name "*.amb" | sed -r 's/(.64)?.amb//'`
 
     samtools fastq ${samtools_fastq_args} ${unmapped_bam} \\
         | bwa mem ${bwa_args} -t $task.cpus -p -K 150000000 -Y \$FASTA - \\

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -35,6 +35,17 @@ def create_fastq_channel(LinkedHashMap row) {
     if (!file(row.fastq_2).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
     }
-    fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+    if (row.fastq_umi){
+        if (!file(row.fastq_umi).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file is specified in samplesheet does not exist!\n${row.fastq_2}"
+        }
+        if ( params.duplex_seq ) {
+            exit 1, "ERROR: Duplex mode is not compatible with a UMI sequencing file. Please use --duplex_seq false when using a UMI fastq file."
+        }
+        fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ]
+    } else {
+        fastq_list = [ file(row.fastq_1), file(row.fastq_2) ]
+    }
+    fastq_meta = [ meta, fastq_list ]
     return fastq_meta
 }
diff --git a/workflows/fastquorum.nf b/workflows/fastquorum.nf
@@ -120,45 +120,49 @@ workflow FASTQUORUM {
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
-    )
-
     //
     // MODULE: Run fgbio FastqToBam
     //
     FASTQTOBAM(INPUT_CHECK.out.reads)
+    ch_versions = ch_versions.mix(FASTQTOBAM.out.versions.first())
 
     //
     // MODULE: Align with bwa mem
     //
     grouped_sort = true
     ALIGN_RAW_BAM(FASTQTOBAM.out.bam, ch_ref_index_dir, grouped_sort)
+    ch_versions = ch_versions.mix(ALIGN_RAW_BAM.out.versions)
 
     //
     // MODULE: Run fgbio GroupReadsByUmi
     //
     GROUPREADSBYUMI(ALIGN_RAW_BAM.out.bam, groupreadsbyumi_strategy, params.groupreadsbyumi_edits)
+    ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions.first())
 
     // TODO: duplex_seq can be inferred from the read structure, but that's out of scope for now
     if (params.duplex_seq) {
         //
         // MODULE: Run fgbio CallDuplexConsensusReads
         //
         CALLDDUPLEXCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
+        ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())
 
         //
         // MODULE: Run fgbio CollecDuplexSeqMetrics
         //
         COLLECTDUPLEXSEQMETRICS(GROUPREADSBYUMI.out.bam)
+        ch_versions = ch_versions.mix(COLLECTDUPLEXSEQMETRICS.out.versions.first())
 
         // Add the consensus BAM to the channel for downstream processing
         CALLDDUPLEXCONSENSUSREADS.out.bam.set { ch_consensus_bam }
+        ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())
+
     } else {
         //
         // MODULE: Run fgbio CallMolecularConsensusReads
         //
         CALLMOLECULARCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
+        ch_versions = ch_versions.mix(CALLMOLECULARCONSENSUSREADS.out.versions.first())
 
         // Add the consensus BAM to the channel for downstream processing
         CALLMOLECULARCONSENSUSREADS.out.bam.set { ch_consensus_bam }
@@ -168,11 +172,17 @@ workflow FASTQUORUM {
     // MODULE: Align with bwa mem
     //
     ALIGN_CONSENSUS_BAM(ch_consensus_bam, ch_ref_index_dir, false)
+    ch_versions = ch_versions.mix(ALIGN_CONSENSUS_BAM.out.versions.first())
 
     //
     // MODULE: Run fgbio FilterConsensusReads
     //
     FILTERCONSENSUSREADS(ALIGN_CONSENSUS_BAM.out.bam, ch_ref_fasta, filter_min_reads, params.filter_min_baseq, params.filter_max_base_error_rate)
+    ch_versions = ch_versions.mix(FILTERCONSENSUSREADS.out.versions.first())
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
 
     //
     // MODULE: MultiQC