diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..45cb1077 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at odiogosilva@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/README.md b/README.md index 5273a198..9ddffcce 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ [![Documentation Status](https://readthedocs.org/projects/flowcraft/badge/?version=latest)](http://flowcraft.readthedocs.io/en/latest/?badge=latest) [![PyPI version](https://badge.fury.io/py/flowcraft.svg)](https://badge.fury.io/py/flowcraft) [![Anaconda-Server Badge](https://anaconda.org/bioconda/flowcraft/badges/version.svg)](https://anaconda.org/bioconda/flowcraft) +[![Gitter](https://badges.gitter.im/flowcraft-community/community.svg)](https://gitter.im/flowcraft-community/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)

nextflow_logo diff --git a/changelog.md b/changelog.md index 9e897e58..5660db84 100644 --- a/changelog.md +++ b/changelog.md @@ -7,6 +7,9 @@ - Fix bug in `downsample_fastq` where the resulting output files was not being saved in the `results` directory - Fix bug in `downsample_fastq` where the output files were being saved as broken symlinks when there was no down-sampling occurring +- Moved `renamePE_samtoolsFASTQ.py` from `flowcraft/bin/` to the `flowcraft/templates` folder and updated it to +python3 - fix issue #219 + ### Minor/Other changes diff --git a/flowcraft/bin/renamePE_samtoolsFASTQ.py b/flowcraft/bin/renamePE_samtoolsFASTQ.py deleted file mode 100755 index 052046d1..00000000 --- a/flowcraft/bin/renamePE_samtoolsFASTQ.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python2 - -#TODO - change to py3 -# -*- coding: utf-8 -*- - -""" -renamePE_samtoolsFASTQ.py - Rename the fastq headers with PE terminations -that were not include in samtools fastq command - -Copyright (C) 2017 Miguel Machado -Last modified: January 10, 2017 -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. -You should have received a copy of the GNU General Public License -along with this program. If not, see . 
-""" - -import os -import sys -import time -import argparse -import itertools - - -version = '0.1' - - -def formartFastqHeaders(in_fastq_1, in_fastq_2, outdir): - out_fastq_1 = os.path.join(outdir, os.path.splitext(os.path.basename(in_fastq_1))[0] + '.headersRenamed_1.fq') - out_fastq_2 = os.path.join(outdir, os.path.splitext(os.path.basename(in_fastq_2))[0] + '.headersRenamed_2.fq') - writer_in_fastq_1 = open(out_fastq_1, 'wt') - writer_in_fastq_2 = open(out_fastq_2, 'wt') - outfiles = [out_fastq_1, out_fastq_2] - with open(in_fastq_1, 'rtU') as reader_in_fastq_1, open(in_fastq_2, 'rtU') as reader_in_fastq_2: - plus_line = True - quality_line = True - number_reads = 0 - for in_1, in_2 in itertools.izip(reader_in_fastq_1, reader_in_fastq_2): - if len(in_1) > 0: - in_1 = in_1.splitlines()[0] - in_2 = in_2.splitlines()[0] - if in_1.startswith('@') and plus_line and quality_line: - if in_1 != in_2: - sys.exit('The PE fastq files are not aligned properly!') - in_1 += '/1' + '\n' - in_2 += '/2' + '\n' - writer_in_fastq_1.write(in_1) - writer_in_fastq_2.write(in_2) - plus_line = False - quality_line = False - elif in_1.startswith('+') and not plus_line: - in_1 += '\n' - writer_in_fastq_1.write(in_1) - writer_in_fastq_2.write(in_1) - plus_line = True - elif plus_line and not quality_line: - in_1 += '\n' - in_2 += '\n' - writer_in_fastq_1.write(in_1) - writer_in_fastq_2.write(in_2) - writer_in_fastq_1.flush() - writer_in_fastq_2.flush() - number_reads += 1 - quality_line = True - else: - in_1 += '\n' - in_2 += '\n' - writer_in_fastq_1.write(in_1) - writer_in_fastq_2.write(in_2) - return number_reads, outfiles - - -def compressionType(file_to_test): - magic_dict = {'\x1f\x8b\x08': ['gzip', 'gunzip'], '\x42\x5a\x68': ['bzip2', 'bunzip2']} - - max_len = max(len(x) for x in magic_dict) - - with open(file_to_test, 'r') as reader: - file_start = reader.read(max_len) - - for magic, filetype in magic_dict.items(): - if file_start.startswith(magic): - return filetype - return None 
- - -def runTime(start_time): - end_time = time.time() - time_taken = end_time - start_time - hours, rest = divmod(time_taken, 3600) - minutes, seconds = divmod(rest, 60) - print 'Runtime :' + str(hours) + 'h:' + str(minutes) + 'm:' + str(round(seconds, 2)) + 's' - return time_taken - - -def main(): - parser = argparse.ArgumentParser(prog='renamePE_samtoolsFASTQ.py', description='Rename the fastq headers with PE terminations that were not include in samtools fastq command', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) - - parser_required = parser.add_argument_group('Required options') - parser_required.add_argument('-1', '--fastq_1', type=argparse.FileType('r'), metavar='/path/to/input/file_1.fq', help='Uncompressed fastq file containing mate 1 reads', required=True) - parser_required.add_argument('-2', '--fastq_2', type=argparse.FileType('r'), metavar='/path/to/input/file_2.fq', help='Uncompressed fastq file containing mate 2 reads', required=True) - - parser_optional_general = parser.add_argument_group('General facultative options') - parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/output/directory/', help='Path for output directory', required=False, default='.') - - args = parser.parse_args() - - print '\n' + 'STARTING renamePE_samtoolsFASTQ.py' + '\n' - start_time = time.time() - - fastq_files = [os.path.abspath(args.fastq_1.name), os.path.abspath(args.fastq_2.name)] - - print 'Check if files are compressed' + '\n' - for fastq in fastq_files: - if compressionType(fastq) is not None: - sys.exit('Compressed fastq files found') - - outdir = os.path.abspath(args.outdir) - if not os.path.isdir(outdir): - os.makedirs(outdir) - - print 'Renaming fastq headers' + '\n' - number_reads, outfiles = formartFastqHeaders(fastq_files[0], fastq_files[1], outdir) - - print 'It was written ' + str(number_reads) + ' read pairs 
in ' + str(outfiles) + ' files' + '\n' - - print '\n' + 'END renamePE_samtoolsFASTQ.py' - time_taken = runTime(start_time) - del time_taken - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/flowcraft/generator/components/mapping.py b/flowcraft/generator/components/mapping.py index becca7bb..f178ab11 100644 --- a/flowcraft/generator/components/mapping.py +++ b/flowcraft/generator/components/mapping.py @@ -106,7 +106,8 @@ def __init__(self, **kwargs): } self.status_channels = [ - "retrieve_mapped" + "retrieve_mapped", + "renamePE" ] diff --git a/flowcraft/generator/components/metagenomics.py b/flowcraft/generator/components/metagenomics.py index 692a940c..5cd6b377 100644 --- a/flowcraft/generator/components/metagenomics.py +++ b/flowcraft/generator/components/metagenomics.py @@ -475,6 +475,7 @@ def __init__(self, **kwargs): self.status_channels = [ "remove_host", + "renamePE", "report_remove_host" ] diff --git a/flowcraft/generator/templates/remove_host.nf b/flowcraft/generator/templates/remove_host.nf index f070c546..b7312454 100644 --- a/flowcraft/generator/templates/remove_host.nf +++ b/flowcraft/generator/templates/remove_host.nf @@ -17,7 +17,7 @@ process remove_host_{{ pid }} { val clear from checkpointClear_{{ pid }} output: - set sample_id , file("${sample_id}*.headersRenamed_*.fq.gz") into {{ output_channel }} + set sample_id , file("${sample_id}_unmapped_*.fq") into OUT_remove_host_{{ pid }} set sample_id, file("*_bowtie2.log") into into_json_{{ pid }} {% with task_name="remove_host" %} {%- include "compiler_channels.txt" ignore missing -%} @@ -36,11 +36,6 @@ process remove_host_{{ pid }} { rm ${sample_id}_samtools.bam - renamePE_samtoolsFASTQ.py -1 ${sample_id}_unmapped_1.fq -2 ${sample_id}_unmapped_2.fq - - gzip *.headersRenamed_*.fq - rm *.fq - if [ "$clear" = "true" ]; then work_regex=".*/work/.{2}/.{30}/.*" @@ -58,6 +53,28 @@ process remove_host_{{ pid }} { } +process renamePE_{{ pid }} { + + // Send POST request to 
platform + {% include "post.txt" ignore missing %} + + tag { sample_id } + publishDir 'results/mapping/remove_host_{{ pid }}/' + + input: + set sample_id, file(fastq_pair) from OUT_remove_host_{{ pid }} + + output: + set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }} + {% with task_name="renamePE" %} + {%- include "compiler_channels.txt" ignore missing -%} + {% endwith %} + + script: + template "renamePE_samtoolsFASTQ.py" + +} + process report_remove_host_{{ pid }} { diff --git a/flowcraft/generator/templates/retrieve_mapped.nf b/flowcraft/generator/templates/retrieve_mapped.nf index 4a906c11..be371b17 100644 --- a/flowcraft/generator/templates/retrieve_mapped.nf +++ b/flowcraft/generator/templates/retrieve_mapped.nf @@ -4,13 +4,12 @@ process retrieve_mapped_{{ pid }} { {% include "post.txt" ignore missing %} tag { sample_id } - publishDir 'results/mapping/retrieve_mapped_{{ pid }}/' input: set sample_id, file(bam) from {{ input_channel }} output: - set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }} + set sample_id , file("*_mapped_*.fq") into OUT_retrieve_mapped_{{ pid }} {% with task_name="retrieve_mapped" %} {%- include "compiler_channels.txt" ignore missing -%} {% endwith %} @@ -25,12 +24,28 @@ process retrieve_mapped_{{ pid }} { rm ${sample_id}_samtools.bam - renamePE_samtoolsFASTQ.py -1 ${sample_id}_mapped_1.fq -2 ${sample_id}_mapped_2.fq + """ +} - gzip *.headersRenamed_*.fq +process renamePE_{{ pid }} { + + tag { sample_id } + publishDir 'results/mapping/retrieve_mapped_{{ pid }}/' + + {% include "post.txt" ignore missing %} + + input: + set sample_id, file(fastq_pair) from OUT_retrieve_mapped_{{ pid }} + + output: + set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }} + {% with task_name="renamePE" %} + {%- include "compiler_channels.txt" ignore missing -%} + {% endwith %} + + script: + template "renamePE_samtoolsFASTQ.py" - rm *.fq - """ } {{ forks }} \ No newline at end of file diff 
--git a/flowcraft/templates/renamePE_samtoolsFASTQ.py b/flowcraft/templates/renamePE_samtoolsFASTQ.py
new file mode 100755
index 00000000..a9da48d5
--- /dev/null
+++ b/flowcraft/templates/renamePE_samtoolsFASTQ.py
@@ -0,0 +1,271 @@
+#!/usr/bin/env python3
+
+# -*- coding: utf-8 -*-
+
+"""
+Purpose
+-------
+
+This module renames the fastq headers with PE terminations
+that were not included in the samtools fastq command.
+
+
+Expected input
+--------------
+
+The following variables are expected whether using NextFlow or the
+:py:func:`main` executor.
+
+- ``sample_id``: Sample Identification string.
+    - e.g.: ``'SampleA'``
+- ``fastq_pair`` : Pair of FastQ file paths.
+    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
+
+Generated output
+----------------
+- ``fastq_pair`` : Pair of gzip-compressed FastQ files with renamed headers.
+    - e.g.: ``'SampleA.headersRenamed_1.fq.gz SampleA.headersRenamed_2.fq.gz'``
+
+Code documentation
+------------------
+
+"""
+
+import os
+import sys
+import io
+import bz2
+import gzip
+import shutil
+import zipfile
+from contextlib import contextmanager
+
+# NOTE(review): MainWrapper is imported but never applied to main() in this
+# template - confirm whether that is intentional.
+from flowcraft_utils.flowcraft_base import get_logger, MainWrapper
+
+logger = get_logger(__file__)
+
+__version__ = "1.0.1"
+__build__ = "09.09.2019"
+__template__ = "retrieved_mapped-nf"
+
+if __file__.endswith(".command.sh"):
+    SAMPLE_ID = '$sample_id'
+    FASTQ_PAIR = '$fastq_pair'.split()
+    logger.debug("Running {} with parameters:".format(
+        os.path.basename(__file__)))
+    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
+    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
+
+
+# Openers keyed by compression type. zipfile.ZipFile does not
+# accept an "rt" mode, so zip archives are opened in open_fastq() instead of
+# being called through this table.
+COPEN = {
+    "gz": gzip.open,
+    "bz2": bz2.open,
+    "zip": zipfile.ZipFile
+}
+
+# Leading byte signatures of the supported formats. Backslashes are doubled
+# in this file; presumably the Nextflow template engine unescapes them.
+MAGIC_DICT = {
+    b"\\x1f\\x8b\\x08": "gz",
+    b"\\x42\\x5a\\x68": "bz2",
+    b"\\x50\\x4b\\x03\\x04": "zip"
+}
+
+
+def guess_file_compression(file_path, magic_dict=None):
+    """Guesses the compression of an input file.
+
+    This function guesses the compression of a given file by checking for
+    a binary signature at the beginning of the file. These signatures are
+    stored in the :py:data:`MAGIC_DICT` dictionary. The supported compression
+    formats are gzip, bzip2 and zip. If none of the signatures in this
+    dictionary are found at the beginning of the file, it returns ``None``.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to input file.
+    magic_dict : dict, optional
+        Dictionary containing the signatures of the compression types. The
+        key should be the binary signature and the value should be the
+        compression format. If left ``None``, it falls back to
+        :py:data:`MAGIC_DICT`.
+
+    Returns
+    -------
+    file_type : str or None
+        If a compression type is detected, returns a string with the format.
+        If not, returns ``None``.
+    """
+
+    if not magic_dict:
+        magic_dict = MAGIC_DICT
+
+    max_len = max(len(x) for x in magic_dict)
+
+    with open(file_path, "rb") as f:
+        file_start = f.read(max_len)
+
+    logger.debug("Binary signature start: {}".format(file_start))
+
+    for magic, file_type in magic_dict.items():
+        if file_start.startswith(magic):
+            return file_type
+
+    return None
+
+
+@contextmanager
+def open_fastq(file_path):
+    """Opens a plain or compressed FastQ file for text reading.
+
+    The compression is detected with :py:func:`guess_file_compression`.
+    For zip archives, the first member of the archive is read.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the FastQ file.
+
+    Yields
+    ------
+    handle : file object
+        Text-mode handle, closed when the context exits.
+    """
+
+    logger.info("[{}] Guessing file compression".format(file_path))
+    ftype = guess_file_compression(file_path)
+
+    if ftype == "zip":
+        logger.info("[{}] Found file compression: {}".format(file_path, ftype))
+        with zipfile.ZipFile(file_path) as archive:
+            members = archive.namelist()
+            if not members:
+                sys.exit("Zip archive {} has no members".format(file_path))
+            # Archive members are binary streams; wrap one for line iteration
+            with io.TextIOWrapper(archive.open(members[0])) as handle:
+                yield handle
+    elif ftype:
+        logger.info("[{}] Found file compression: {}".format(file_path, ftype))
+        with COPEN[ftype](file_path, "rt") as handle:
+            yield handle
+    else:
+        logger.info("[{}] File compression not found. Assuming an "
+                    "uncompressed file".format(file_path))
+        with open(file_path, "r") as handle:
+            yield handle
+
+
+def formartFastqHeaders(sample_name, in_fastq_1, in_fastq_2):
+    """Writes copies of a FastQ pair with /1 and /2 appended to the headers.
+
+    Both inputs are read in lockstep until the shorter one ends. Each header
+    line must be identical in the two files; the mate 1 header gets ``/1``
+    appended and the mate 2 header gets ``/2``. Sequence and quality lines
+    are copied unchanged and the ``+`` line of mate 1 is written to both
+    outputs.
+
+    Parameters
+    ----------
+    sample_name : str
+        Prefix of the output file names.
+    in_fastq_1 : str
+        Path to the mate 1 FastQ file (plain, gzip, bzip2 or zip).
+    in_fastq_2 : str
+        Path to the mate 2 FastQ file (plain, gzip, bzip2 or zip).
+
+    Returns
+    -------
+    number_reads : int
+        Number of read pairs written.
+    outfiles : list
+        Paths of the two uncompressed output files, created in the current
+        working directory.
+
+    Raises
+    ------
+    SystemExit
+        If a pair of header lines differs between the two files.
+    """
+
+    out_fastq_1 = os.path.join(os.getcwd(), sample_name + '.headersRenamed_1.fq')
+    out_fastq_2 = os.path.join(os.getcwd(), sample_name + '.headersRenamed_2.fq')
+    outfiles = [out_fastq_1, out_fastq_2]
+
+    # A line starting with "@" is treated as a header only when both flags
+    # are set, i.e. at the start or right after a record's quality line.
+    plus_line = True
+    quality_line = True
+    number_reads = 0
+
+    with open_fastq(in_fastq_1) as reader_1, open_fastq(in_fastq_2) as reader_2:
+        with open(out_fastq_1, 'wt') as writer_1, open(out_fastq_2, 'wt') as writer_2:
+            for in_1, in_2 in zip(reader_1, reader_2):
+                in_1 = in_1.splitlines()[0]
+                in_2 = in_2.splitlines()[0]
+
+                if in_1.startswith('@') and plus_line and quality_line:
+                    if in_1 != in_2:
+                        sys.exit('The PE fastq files are not aligned properly!')
+                    writer_1.write(in_1 + '/1' + '\\n')
+                    writer_2.write(in_2 + '/2' + '\\n')
+                    plus_line = False
+                    quality_line = False
+                elif in_1.startswith('+') and not plus_line:
+                    writer_1.write(in_1 + '\\n')
+                    writer_2.write(in_1 + '\\n')
+                    plus_line = True
+                elif plus_line and not quality_line:
+                    writer_1.write(in_1 + '\\n')
+                    writer_2.write(in_2 + '\\n')
+                    number_reads += 1
+                    quality_line = True
+                else:
+                    writer_1.write(in_1 + '\\n')
+                    writer_2.write(in_2 + '\\n')
+
+    return number_reads, outfiles
+
+
+def main(sample_id, fastq_files):
+    """Renames the headers of a FastQ pair and gzips the results.
+
+    Parameters
+    ----------
+    sample_id : str
+        Sample Identification string, used as output file prefix.
+    fastq_files : list
+        Paths of the mate 1 and mate 2 FastQ files, in that order.
+    """
+
+    logger.info("STARTING renamePE_samtoolsFASTQ.py")
+
+    if len(fastq_files) < 2:
+        sys.exit("Expected a pair of FastQ files but got {}".format(
+            len(fastq_files)))
+
+    for fastq in fastq_files:
+        logger.info("Processing file {}".format(fastq))
+
+    logger.info('Renaming fastq headers')
+    number_reads, outfiles = formartFastqHeaders(sample_id, fastq_files[0], fastq_files[1])
+
+    logger.info('{} read pairs were written in {} and {}. Compressing...'.format(
+        number_reads, outfiles[0], outfiles[1]))
+
+    # Compress each output and drop the uncompressed copy
+    for outfile in outfiles:
+        with open(outfile, 'rb') as f_in, gzip.open(outfile + '.gz', 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+        os.remove(outfile)
+
+    logger.info('DONE')
+
+
+if __name__ == "__main__":
+    main(SAMPLE_ID, FASTQ_PAIR)