diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 7d41d51..d53ed0b 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,3 +1,4 @@
-sample,fastq_1,fastq_2,read_structure
+sample,fastq_1,fastq_2,read_structure,fastq_umi
 SAMPLE_DUPLEX_SEQ,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,10M1S+T 10M1S+T
 SAMPLE_SINGLE_UMI,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz,12M+T +T
+SAMPLE_UMI_FASTQ,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R3_001.fastq.gz,+T +T +M,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index e0c6509..50dbf68 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -73,7 +73,9 @@ def validate_and_transform(self, row):
         self._validate_first(row)
         self._validate_second(row)
         self._validate_pair(row)
-        self._seen.add((row[self._sample_col], row[self._first_col], row[self._second_col]))
+        self._seen.add(
+            (row[self._sample_col], row[self._first_col], row[self._second_col])
+        )
         self.modified.append(row)
 
     def _validate_sample(self, row):
@@ -95,14 +97,14 @@ def _validate_second(self, row):
     def _validate_pair(self, row):
         """Assert that read pairs have the same file extension. Report pair status."""
         assert (
-            Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
+            Path(row[self._first_col]).suffixes[-2:]
+            == Path(row[self._second_col]).suffixes[-2:]
         ), "FASTQ pairs must have the same file extensions."
 
     def _validate_read_structure(self, row):
         """Assert that the second FASTQ entry has the right format if it exists."""
-        assert len(row[self._read_structure_col].split(' ')) == 2, (
-            "Two read structures must be provided."
-        )
+        n_structures = len(row[self._read_structure_col].split(" "))
+        assert 2 <= n_structures <= 3, "Two or three read structures must be provided."
 
     def _validate_fastq_format(self, filename):
         """Assert that a given filename has one of the expected FASTQ extensions."""
@@ -119,7 +121,9 @@ def validate_unique_samples(self):
        FASTQ file combination exists.
 
        """
-        assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
+        assert len(self._seen) == len(
+            self.modified
+        ), "The pair of sample name and FASTQ must be unique."
         if len({pair[0] for pair in self._seen}) < len(self._seen):
             counts = Counter(pair[0] for pair in self._seen)
             seen = Counter()
@@ -192,7 +196,9 @@ def check_samplesheet(file_in, file_out):
         reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
         # Validate the existence of the expected header columns.
         if not required_columns.issubset(reader.fieldnames):
-            logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.")
+            logger.critical(
+                f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}."
+            )
             sys.exit(1)
         # Validate each row.
         checker = RowChecker()
diff --git a/modules/local/align_bam/main.nf b/modules/local/align_bam/main.nf
index de6d104..287592f 100644
--- a/modules/local/align_bam/main.nf
+++ b/modules/local/align_bam/main.nf
@@ -52,7 +52,7 @@ process ALIGN_BAM {
     """
     # The real path to the FASTA
-    FASTA=`find -L ./ -name "*.amb" | sed 's/.amb//'`
+    FASTA=`find -L ./ -name "*.amb" | sed -r 's/(.64)?.amb//'`
 
     samtools fastq ${samtools_fastq_args} ${unmapped_bam} \\
         | bwa mem ${bwa_args} -t $task.cpus -p -K 150000000 -Y \$FASTA - \\
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index bb11361..cd0ccde 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -35,6 +35,17 @@ def create_fastq_channel(LinkedHashMap row) {
     if (!file(row.fastq_2).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
     }
-    fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
+    if (row.fastq_umi) {
+        if (!file(row.fastq_umi).exists()) {
+            exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file specified in samplesheet does not exist!\n${row.fastq_umi}"
+        }
+        if ( params.duplex_seq ) {
+            exit 1, "ERROR: Duplex mode is not compatible with a UMI sequencing file. Please use --duplex_seq false when using a UMI FASTQ file."
+        }
+        fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ]
+    } else {
+        fastq_list = [ file(row.fastq_1), file(row.fastq_2) ]
+    }
+    fastq_meta = [ meta, fastq_list ]
     return fastq_meta
 }
diff --git a/workflows/fastquorum.nf b/workflows/fastquorum.nf
index f724dfb..e0a4330 100644
--- a/workflows/fastquorum.nf
+++ b/workflows/fastquorum.nf
@@ -120,25 +120,24 @@ workflow FASTQUORUM {
     )
     ch_versions = ch_versions.mix(FASTQC.out.versions.first())
 
-    CUSTOM_DUMPSOFTWAREVERSIONS (
-        ch_versions.unique().collectFile(name: 'collated_versions.yml')
-    )
-
     //
     // MODULE: Run fgbio FastqToBam
     //
     FASTQTOBAM(INPUT_CHECK.out.reads)
+    ch_versions = ch_versions.mix(FASTQTOBAM.out.versions.first())
 
     //
     // MODULE: Align with bwa mem
    //
     grouped_sort = true
     ALIGN_RAW_BAM(FASTQTOBAM.out.bam, ch_ref_index_dir, grouped_sort)
+    ch_versions = ch_versions.mix(ALIGN_RAW_BAM.out.versions)
 
     //
     // MODULE: Run fgbio GroupReadsByUmi
     //
     GROUPREADSBYUMI(ALIGN_RAW_BAM.out.bam, groupreadsbyumi_strategy, params.groupreadsbyumi_edits)
+    ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions.first())
 
     // TODO: duplex_seq can be inferred from the read structure, but that's out of scope for now
     if (params.duplex_seq) {
@@ -146,19 +145,24 @@ workflow FASTQUORUM {
         // MODULE: Run fgbio CallDuplexConsensusReads
         //
         CALLDDUPLEXCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
+        ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())
 
         //
         // MODULE: Run fgbio CollecDuplexSeqMetrics
         //
         COLLECTDUPLEXSEQMETRICS(GROUPREADSBYUMI.out.bam)
+        ch_versions = ch_versions.mix(COLLECTDUPLEXSEQMETRICS.out.versions.first())
 
         // Add the consensus BAM to the channel for downstream processing
         CALLDDUPLEXCONSENSUSREADS.out.bam.set { ch_consensus_bam }
+        ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())
+
     } else {
         //
         // MODULE: Run fgbio CallMolecularConsensusReads
         //
         CALLMOLECULARCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
+        ch_versions = ch_versions.mix(CALLMOLECULARCONSENSUSREADS.out.versions.first())
 
         // Add the consensus BAM to the channel for downstream processing
         CALLMOLECULARCONSENSUSREADS.out.bam.set { ch_consensus_bam }
@@ -168,11 +172,17 @@ workflow FASTQUORUM {
     // MODULE: Align with bwa mem
     //
     ALIGN_CONSENSUS_BAM(ch_consensus_bam, ch_ref_index_dir, false)
+    ch_versions = ch_versions.mix(ALIGN_CONSENSUS_BAM.out.versions.first())
 
     //
     // MODULE: Run fgbio FilterConsensusReads
     //
     FILTERCONSENSUSREADS(ALIGN_CONSENSUS_BAM.out.bam, ch_ref_fasta, filter_min_reads, params.filter_min_baseq, params.filter_max_base_error_rate)
+    ch_versions = ch_versions.mix(FILTERCONSENSUSREADS.out.versions.first())
+
+    CUSTOM_DUMPSOFTWAREVERSIONS (
+        ch_versions.unique().collectFile(name: 'collated_versions.yml')
+    )
 
     //
     // MODULE: MultiQC