Skip to content

Third umi read #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion assets/samplesheet.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
sample,fastq_1,fastq_2,read_structure
sample,fastq_1,fastq_2,read_structure,fastq_umi
SAMPLE_DUPLEX_SEQ,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,10M1S+T 10M1S+T
SAMPLE_SINGLE_UMI,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz,12M+T +T
SAMPLE_UMI_FASTQ,/path/to/fastq/files/AEG588A1_S2_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S2_L002_R3_001.fastq.gz,+T +T +M,/path/to/fastq/files/AEG588A1_S2_L002_R2_001.fastq.gz
20 changes: 13 additions & 7 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ def validate_and_transform(self, row):
self._validate_first(row)
self._validate_second(row)
self._validate_pair(row)
self._seen.add((row[self._sample_col], row[self._first_col], row[self._second_col]))
self._seen.add(
(row[self._sample_col], row[self._first_col], row[self._second_col])
)
self.modified.append(row)

def _validate_sample(self, row):
Expand All @@ -95,14 +97,14 @@ def _validate_second(self, row):
def _validate_pair(self, row):
"""Assert that read pairs have the same file extension. Report pair status."""
assert (
Path(row[self._first_col]).suffixes[-2:] == Path(row[self._second_col]).suffixes[-2:]
Path(row[self._first_col]).suffixes[-2:]
== Path(row[self._second_col]).suffixes[-2:]
), "FASTQ pairs must have the same file extensions."

def _validate_read_structure(self, row):
"""Assert that the second FASTQ entry has the right format if it exists."""
assert len(row[self._read_structure_col].split(' ')) == 2, (
"Two read structures must be provided."
)
n_structures = len(row[self._read_structure_col].split(" "))
assert 2 <= n_structures <= 3, "Two read structures must be provided."

def _validate_fastq_format(self, filename):
"""Assert that a given filename has one of the expected FASTQ extensions."""
Expand All @@ -119,7 +121,9 @@ def validate_unique_samples(self):
FASTQ file combination exists.

"""
assert len(self._seen) == len(self.modified), "The pair of sample name and FASTQ must be unique."
assert len(self._seen) == len(
self.modified
), "The pair of sample name and FASTQ must be unique."
if len({pair[0] for pair in self._seen}) < len(self._seen):
counts = Counter(pair[0] for pair in self._seen)
seen = Counter()
Expand Down Expand Up @@ -192,7 +196,9 @@ def check_samplesheet(file_in, file_out):
reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
# Validate the existence of the expected header columns.
if not required_columns.issubset(reader.fieldnames):
logger.critical(f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}.")
logger.critical(
f"The sample sheet **must** contain the column headers: {', '.join(required_columns)}."
)
sys.exit(1)
# Validate each row.
checker = RowChecker()
Expand Down
2 changes: 1 addition & 1 deletion modules/local/align_bam/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ process ALIGN_BAM {

"""
# The real path to the FASTA
FASTA=`find -L ./ -name "*.amb" | sed 's/.amb//'`
FASTA=`find -L ./ -name "*.amb" | sed -r 's/(.64)?.amb//'`

samtools fastq ${samtools_fastq_args} ${unmapped_bam} \\
| bwa mem ${bwa_args} -t $task.cpus -p -K 150000000 -Y \$FASTA - \\
Expand Down
13 changes: 12 additions & 1 deletion subworkflows/local/input_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ def create_fastq_channel(LinkedHashMap row) {
if (!file(row.fastq_2).exists()) {
exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}"
}
fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ]
if (row.fastq_umi){
if (!file(row.fastq_umi).exists()) {
exit 1, "ERROR: Please check input samplesheet -> UMI FastQ file is specified in samplesheet does not exist!\n${row.fastq_2}"
}
if ( params.duplex_seq ) {
exit 1, "ERROR: Duplex mode is not compatible with a UMI sequencing file. Please use --duplex_seq false when using a UMI fastq file."
}
fastq_list = [ file(row.fastq_1), file(row.fastq_2), file(row.fastq_umi) ]
} else {
fastq_list = [ file(row.fastq_1), file(row.fastq_2) ]
}
fastq_meta = [ meta, fastq_list ]
return fastq_meta
}
18 changes: 14 additions & 4 deletions workflows/fastquorum.nf
Original file line number Diff line number Diff line change
Expand Up @@ -120,45 +120,49 @@ workflow FASTQUORUM {
)
ch_versions = ch_versions.mix(FASTQC.out.versions.first())

CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.unique().collectFile(name: 'collated_versions.yml')
)

//
// MODULE: Run fgbio FastqToBam
//
FASTQTOBAM(INPUT_CHECK.out.reads)
ch_versions = ch_versions.mix(FASTQTOBAM.out.versions.first())

//
// MODULE: Align with bwa mem
//
grouped_sort = true
ALIGN_RAW_BAM(FASTQTOBAM.out.bam, ch_ref_index_dir, grouped_sort)
ch_versions = ch_versions.mix(ALIGN_RAW_BAM.out.versions)

//
// MODULE: Run fgbio GroupReadsByUmi
//
GROUPREADSBYUMI(ALIGN_RAW_BAM.out.bam, groupreadsbyumi_strategy, params.groupreadsbyumi_edits)
ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions.first())

// TODO: duplex_seq can be inferred from the read structure, but that's out of scope for now
if (params.duplex_seq) {
//
// MODULE: Run fgbio CallDuplexConsensusReads
//
CALLDDUPLEXCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())

//
// MODULE: Run fgbio CollecDuplexSeqMetrics
//
COLLECTDUPLEXSEQMETRICS(GROUPREADSBYUMI.out.bam)
ch_versions = ch_versions.mix(COLLECTDUPLEXSEQMETRICS.out.versions.first())

// Add the consensus BAM to the channel for downstream processing
CALLDDUPLEXCONSENSUSREADS.out.bam.set { ch_consensus_bam }
ch_versions = ch_versions.mix(CALLDDUPLEXCONSENSUSREADS.out.versions.first())

} else {
//
// MODULE: Run fgbio CallMolecularConsensusReads
//
CALLMOLECULARCONSENSUSREADS(GROUPREADSBYUMI.out.bam, call_min_reads, params.call_min_baseq)
ch_versions = ch_versions.mix(CALLMOLECULARCONSENSUSREADS.out.versions.first())

// Add the consensus BAM to the channel for downstream processing
CALLMOLECULARCONSENSUSREADS.out.bam.set { ch_consensus_bam }
Expand All @@ -168,11 +172,17 @@ workflow FASTQUORUM {
// MODULE: Align with bwa mem
//
ALIGN_CONSENSUS_BAM(ch_consensus_bam, ch_ref_index_dir, false)
ch_versions = ch_versions.mix(ALIGN_CONSENSUS_BAM.out.versions.first())

//
// MODULE: Run fgbio FilterConsensusReads
//
FILTERCONSENSUSREADS(ALIGN_CONSENSUS_BAM.out.bam, ch_ref_fasta, filter_min_reads, params.filter_min_baseq, params.filter_max_base_error_rate)
ch_versions = ch_versions.mix(FILTERCONSENSUSREADS.out.versions.first())

CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.unique().collectFile(name: 'collated_versions.yml')
)

//
// MODULE: MultiQC
Expand Down