diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..45cb1077
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at odiogosilva@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/README.md b/README.md
index 5273a198..9ddffcce 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,7 @@
[](http://flowcraft.readthedocs.io/en/latest/?badge=latest)
[](https://badge.fury.io/py/flowcraft)
[](https://anaconda.org/bioconda/flowcraft)
+[![Gitter](https://badges.gitter.im/flowcraft-community/community.svg)](https://gitter.im/flowcraft-community/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
diff --git a/changelog.md b/changelog.md
index 9e897e58..5660db84 100644
--- a/changelog.md
+++ b/changelog.md
@@ -7,6 +7,9 @@
- Fix bug in `downsample_fastq` where the resulting output files was not being saved in the `results` directory
- Fix bug in `downsample_fastq` where the output files were being saved as broken symlinks when there was no
down-sampling occurring
+- Moved `renamePE_samtoolsFASTQ.py` from `flowcraft/bin/` to the `flowcraft/templates` folder and updated it to
+python3 - fix issue #219
+
### Minor/Other changes
diff --git a/flowcraft/bin/renamePE_samtoolsFASTQ.py b/flowcraft/bin/renamePE_samtoolsFASTQ.py
deleted file mode 100755
index 052046d1..00000000
--- a/flowcraft/bin/renamePE_samtoolsFASTQ.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/usr/bin/env python2
-
-#TODO - change to py3
-# -*- coding: utf-8 -*-
-
-"""
-renamePE_samtoolsFASTQ.py - Rename the fastq headers with PE terminations
-that were not include in samtools fastq command
-
-Copyright (C) 2017 Miguel Machado
-Last modified: January 10, 2017
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-"""
-
-import os
-import sys
-import time
-import argparse
-import itertools
-
-
-version = '0.1'
-
-
-def formartFastqHeaders(in_fastq_1, in_fastq_2, outdir):
- out_fastq_1 = os.path.join(outdir, os.path.splitext(os.path.basename(in_fastq_1))[0] + '.headersRenamed_1.fq')
- out_fastq_2 = os.path.join(outdir, os.path.splitext(os.path.basename(in_fastq_2))[0] + '.headersRenamed_2.fq')
- writer_in_fastq_1 = open(out_fastq_1, 'wt')
- writer_in_fastq_2 = open(out_fastq_2, 'wt')
- outfiles = [out_fastq_1, out_fastq_2]
- with open(in_fastq_1, 'rtU') as reader_in_fastq_1, open(in_fastq_2, 'rtU') as reader_in_fastq_2:
- plus_line = True
- quality_line = True
- number_reads = 0
- for in_1, in_2 in itertools.izip(reader_in_fastq_1, reader_in_fastq_2):
- if len(in_1) > 0:
- in_1 = in_1.splitlines()[0]
- in_2 = in_2.splitlines()[0]
- if in_1.startswith('@') and plus_line and quality_line:
- if in_1 != in_2:
- sys.exit('The PE fastq files are not aligned properly!')
- in_1 += '/1' + '\n'
- in_2 += '/2' + '\n'
- writer_in_fastq_1.write(in_1)
- writer_in_fastq_2.write(in_2)
- plus_line = False
- quality_line = False
- elif in_1.startswith('+') and not plus_line:
- in_1 += '\n'
- writer_in_fastq_1.write(in_1)
- writer_in_fastq_2.write(in_1)
- plus_line = True
- elif plus_line and not quality_line:
- in_1 += '\n'
- in_2 += '\n'
- writer_in_fastq_1.write(in_1)
- writer_in_fastq_2.write(in_2)
- writer_in_fastq_1.flush()
- writer_in_fastq_2.flush()
- number_reads += 1
- quality_line = True
- else:
- in_1 += '\n'
- in_2 += '\n'
- writer_in_fastq_1.write(in_1)
- writer_in_fastq_2.write(in_2)
- return number_reads, outfiles
-
-
-def compressionType(file_to_test):
- magic_dict = {'\x1f\x8b\x08': ['gzip', 'gunzip'], '\x42\x5a\x68': ['bzip2', 'bunzip2']}
-
- max_len = max(len(x) for x in magic_dict)
-
- with open(file_to_test, 'r') as reader:
- file_start = reader.read(max_len)
-
- for magic, filetype in magic_dict.items():
- if file_start.startswith(magic):
- return filetype
- return None
-
-
-def runTime(start_time):
- end_time = time.time()
- time_taken = end_time - start_time
- hours, rest = divmod(time_taken, 3600)
- minutes, seconds = divmod(rest, 60)
- print 'Runtime :' + str(hours) + 'h:' + str(minutes) + 'm:' + str(round(seconds, 2)) + 's'
- return time_taken
-
-
-def main():
- parser = argparse.ArgumentParser(prog='renamePE_samtoolsFASTQ.py', description='Rename the fastq headers with PE terminations that were not include in samtools fastq command', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
-
- parser_required = parser.add_argument_group('Required options')
- parser_required.add_argument('-1', '--fastq_1', type=argparse.FileType('r'), metavar='/path/to/input/file_1.fq', help='Uncompressed fastq file containing mate 1 reads', required=True)
- parser_required.add_argument('-2', '--fastq_2', type=argparse.FileType('r'), metavar='/path/to/input/file_2.fq', help='Uncompressed fastq file containing mate 2 reads', required=True)
-
- parser_optional_general = parser.add_argument_group('General facultative options')
- parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/output/directory/', help='Path for output directory', required=False, default='.')
-
- args = parser.parse_args()
-
- print '\n' + 'STARTING renamePE_samtoolsFASTQ.py' + '\n'
- start_time = time.time()
-
- fastq_files = [os.path.abspath(args.fastq_1.name), os.path.abspath(args.fastq_2.name)]
-
- print 'Check if files are compressed' + '\n'
- for fastq in fastq_files:
- if compressionType(fastq) is not None:
- sys.exit('Compressed fastq files found')
-
- outdir = os.path.abspath(args.outdir)
- if not os.path.isdir(outdir):
- os.makedirs(outdir)
-
- print 'Renaming fastq headers' + '\n'
- number_reads, outfiles = formartFastqHeaders(fastq_files[0], fastq_files[1], outdir)
-
- print 'It was written ' + str(number_reads) + ' read pairs in ' + str(outfiles) + ' files' + '\n'
-
- print '\n' + 'END renamePE_samtoolsFASTQ.py'
- time_taken = runTime(start_time)
- del time_taken
-
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
diff --git a/flowcraft/generator/components/mapping.py b/flowcraft/generator/components/mapping.py
index becca7bb..f178ab11 100644
--- a/flowcraft/generator/components/mapping.py
+++ b/flowcraft/generator/components/mapping.py
@@ -106,7 +106,8 @@ def __init__(self, **kwargs):
}
self.status_channels = [
- "retrieve_mapped"
+ "retrieve_mapped",
+ "renamePE"
]
diff --git a/flowcraft/generator/components/metagenomics.py b/flowcraft/generator/components/metagenomics.py
index 692a940c..5cd6b377 100644
--- a/flowcraft/generator/components/metagenomics.py
+++ b/flowcraft/generator/components/metagenomics.py
@@ -475,6 +475,7 @@ def __init__(self, **kwargs):
self.status_channels = [
"remove_host",
+ "renamePE",
"report_remove_host"
]
diff --git a/flowcraft/generator/templates/remove_host.nf b/flowcraft/generator/templates/remove_host.nf
index f070c546..b7312454 100644
--- a/flowcraft/generator/templates/remove_host.nf
+++ b/flowcraft/generator/templates/remove_host.nf
@@ -17,7 +17,7 @@ process remove_host_{{ pid }} {
val clear from checkpointClear_{{ pid }}
output:
- set sample_id , file("${sample_id}*.headersRenamed_*.fq.gz") into {{ output_channel }}
+ set sample_id , file("${sample_id}_unmapped_*.fq") into OUT_remove_host_{{ pid }}
set sample_id, file("*_bowtie2.log") into into_json_{{ pid }}
{% with task_name="remove_host" %}
{%- include "compiler_channels.txt" ignore missing -%}
@@ -36,11 +36,6 @@ process remove_host_{{ pid }} {
rm ${sample_id}_samtools.bam
- renamePE_samtoolsFASTQ.py -1 ${sample_id}_unmapped_1.fq -2 ${sample_id}_unmapped_2.fq
-
- gzip *.headersRenamed_*.fq
- rm *.fq
-
if [ "$clear" = "true" ];
then
work_regex=".*/work/.{2}/.{30}/.*"
@@ -58,6 +53,28 @@ process remove_host_{{ pid }} {
}
+// Runs the renamePE_samtoolsFASTQ.py template on the unmapped read pair
+// emitted by remove_host (channel OUT_remove_host) and forwards the gzipped,
+// header-renamed pair to the component's output channel.
+process renamePE_{{ pid }} {
+
+ // Send POST request to platform
+ {% include "post.txt" ignore missing %}
+
+ tag { sample_id }
+ publishDir 'results/mapping/remove_host_{{ pid }}/'
+
+ input:
+ set sample_id, file(fastq_pair) from OUT_remove_host_{{ pid }}
+
+ output:
+ // The template writes <sample_id>.headersRenamed_1.fq.gz and _2.fq.gz
+ set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }}
+ {% with task_name="renamePE" %}
+ {%- include "compiler_channels.txt" ignore missing -%}
+ {% endwith %}
+
+ script:
+ // Python template; it picks up sample_id and fastq_pair from the inputs above
+ template "renamePE_samtoolsFASTQ.py"
+
+}
+
process report_remove_host_{{ pid }} {
diff --git a/flowcraft/generator/templates/retrieve_mapped.nf b/flowcraft/generator/templates/retrieve_mapped.nf
index 4a906c11..be371b17 100644
--- a/flowcraft/generator/templates/retrieve_mapped.nf
+++ b/flowcraft/generator/templates/retrieve_mapped.nf
@@ -4,13 +4,12 @@ process retrieve_mapped_{{ pid }} {
{% include "post.txt" ignore missing %}
tag { sample_id }
- publishDir 'results/mapping/retrieve_mapped_{{ pid }}/'
input:
set sample_id, file(bam) from {{ input_channel }}
output:
- set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }}
+ set sample_id , file("*_mapped_*.fq") into OUT_retrieve_mapped_{{ pid }}
{% with task_name="retrieve_mapped" %}
{%- include "compiler_channels.txt" ignore missing -%}
{% endwith %}
@@ -25,12 +24,28 @@ process retrieve_mapped_{{ pid }} {
rm ${sample_id}_samtools.bam
- renamePE_samtoolsFASTQ.py -1 ${sample_id}_mapped_1.fq -2 ${sample_id}_mapped_2.fq
+ """
+}
- gzip *.headersRenamed_*.fq
+process renamePE_{{ pid }} {
+
+ tag { sample_id }
+ publishDir 'results/mapping/retrieve_mapped_{{ pid }}/'
+
+ {% include "post.txt" ignore missing %}
+
+ input:
+ set sample_id, file(fastq_pair) from OUT_retrieve_mapped_{{ pid }}
+
+ output:
+ set sample_id , file("*.headersRenamed_*.fq.gz") into {{ output_channel }}
+ {% with task_name="renamePE" %}
+ {%- include "compiler_channels.txt" ignore missing -%}
+ {% endwith %}
+
+ script:
+ template "renamePE_samtoolsFASTQ.py"
- rm *.fq
- """
}
{{ forks }}
\ No newline at end of file
diff --git a/flowcraft/templates/renamePE_samtoolsFASTQ.py b/flowcraft/templates/renamePE_samtoolsFASTQ.py
new file mode 100755
index 00000000..a9da48d5
--- /dev/null
+++ b/flowcraft/templates/renamePE_samtoolsFASTQ.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import bz2
+import gzip
+import zipfile
+from flowcraft_utils.flowcraft_base import get_logger, MainWrapper
+
+logger = get_logger(__file__)
+
+"""
+Purpose
+-------
+
+This module appends the paired-end (PE) terminations (``/1`` and ``/2``) to
+the FastQ headers, which the ``samtools fastq`` command does not include.
+
+
+Expected input
+--------------
+
+The following variables are expected whether using NextFlow or the
+:py:func:`main` executor.
+
+- ``sample_id``: Sample Identification string.
+ - e.g.: ``'SampleA'``
+- ``fastq_pair`` : Pair of FastQ file paths.
+    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
+
+Generated output
+----------------
+- ``fastq_pair`` : Pair of gzip-compressed FastQ files with renamed headers.
+    - e.g.: ``'SampleA.headersRenamed_1.fq.gz SampleA.headersRenamed_2.fq.gz'``
+
+Code documentation
+------------------
+
+"""
+
+# Template metadata.
+# NOTE(review): "retrieved_mapped-nf" matches neither Nextflow template that
+# runs this script (retrieve_mapped.nf and remove_host.nf) - confirm the value.
+__version__ = "1.0.1"
+__build__ = "09.09.2019"
+__template__ = "retrieved_mapped-nf"
+
+# The sample_id and fastq_pair placeholders below mirror the input variables
+# declared by the renamePE Nextflow processes. They are only bound when this
+# file is executed under a name ending in ".command.sh".
+# NOTE(review): under any other file name SAMPLE_ID and FASTQ_PAIR stay
+# undefined, so the __main__ entry point at the bottom raises NameError.
+if __file__.endswith(".command.sh"):
+ SAMPLE_ID = '$sample_id'
+ FASTQ_PAIR = '$fastq_pair'.split()
+ logger.debug("Running {} with parameters:".format(
+ os.path.basename(__file__)))
+ logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
+ logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
+
+
+# Opener for each compression format name returned by
+# guess_file_compression().
+# NOTE(review): main() calls these as COPEN[ftype](path, "rt"), but
+# zipfile.ZipFile does not accept "rt" as a mode, so the "zip" entry cannot be
+# used that way.
+COPEN = {
+ "gz": gzip.open,
+ "bz2": bz2.open,
+ "zip": zipfile.ZipFile
+}
+
+# Leading byte signatures of the supported compression formats.
+# NOTE(review): the backslashes are doubled, presumably so that single ones
+# remain after Nextflow has rendered this template - confirm. Run as plain
+# Python, these keys are literal text and never match a compressed file.
+MAGIC_DICT = {
+ b"\\x1f\\x8b\\x08": "gz",
+ b"\\x42\\x5a\\x68": "bz2",
+ b"\\x50\\x4b\\x03\\x04": "zip"
+}
+
+
+def guess_file_compression(file_path, magic_dict=None):
+    """Return the compression format suggested by the first bytes of a file.
+
+    The file is opened in binary mode and as many bytes as the longest
+    signature are read. The format of the first signature that prefixes
+    those bytes is returned.
+
+    Parameters
+    ----------
+    file_path : str
+        Path to the file to inspect.
+    magic_dict : dict, optional
+        Maps a binary signature to the name of its compression format.
+        When omitted or empty, :py:data:`MAGIC_DICT` is used.
+
+    Returns
+    -------
+    file_type : str or None
+        Name of the matching compression format, or ``None`` when no
+        signature matches.
+    """
+
+    signatures = magic_dict if magic_dict else MAGIC_DICT
+    longest = max(len(signature) for signature in signatures)
+
+    with open(file_path, "rb") as handle:
+        header = handle.read(longest)
+
+    logger.debug("Binary signature start: {}".format(header))
+
+    matches = (
+        fmt for signature, fmt in signatures.items()
+        if header.startswith(signature)
+    )
+    return next(matches, None)
+
+
+def formartFastqHeaders(sample_name, in_fastq_1, in_fastq_2):
+    """Write copies of a FastQ pair whose read headers end in /1 and /2.
+
+    Both inputs are read in lockstep. Each header line must be identical in
+    the two files; ``/1`` is appended to it in the first output and ``/2``
+    in the second. The ``+`` separator line of mate 1 is written to both
+    outputs; sequence and quality lines are copied unchanged.
+
+    Parameters
+    ----------
+    sample_name : str
+        Prefix of the output file names.
+    in_fastq_1, in_fastq_2 : str or file object
+        Path of the mate 1 / mate 2 FastQ file, or an already opened
+        text-mode handle on it. Handles are closed before returning.
+
+    Returns
+    -------
+    number_reads : int
+        Number of read pairs written (one per quality line).
+    outfiles : list of str
+        Paths of the two output files, ``<sample_name>.headersRenamed_1.fq``
+        and ``<sample_name>.headersRenamed_2.fq`` in the working directory.
+
+    Raises
+    ------
+    SystemExit
+        If a header line differs between the two input files.
+
+    Notes
+    -----
+    Reading stops at the end of the shorter input, so extra lines in the
+    longer file are ignored.
+    """
+
+    def _reader(source):
+        # main() hands over an opened handle for compressed input and a
+        # plain path otherwise; accept both.
+        return source if hasattr(source, 'read') else open(source, 'r')
+
+    out_fastq_1 = os.path.join(os.getcwd(), sample_name + '.headersRenamed_1.fq')
+    out_fastq_2 = os.path.join(os.getcwd(), sample_name + '.headersRenamed_2.fq')
+    outfiles = [out_fastq_1, out_fastq_2]
+
+    # State of the record being parsed: both flags are True between records,
+    # plus_line turns True again once the "+" separator has been seen and
+    # quality_line once the quality string has been written.
+    plus_line = True
+    quality_line = True
+    number_reads = 0
+
+    # Context managers close all four files even when sys.exit() is raised.
+    with _reader(in_fastq_1) as reader_1, _reader(in_fastq_2) as reader_2:
+        with open(out_fastq_1, 'wt') as writer_1, open(out_fastq_2, 'wt') as writer_2:
+            for in_1, in_2 in zip(reader_1, reader_2):
+                in_1 = in_1.splitlines()[0]
+                in_2 = in_2.splitlines()[0]
+
+                # The newline literals keep the doubled backslash used
+                # throughout this template file.
+                if in_1.startswith('@') and plus_line and quality_line:
+                    # Header line: only valid between two records
+                    if in_1 != in_2:
+                        sys.exit('The PE fastq files are not aligned properly!')
+                    writer_1.write(in_1 + '/1' + '\\n')
+                    writer_2.write(in_2 + '/2' + '\\n')
+                    plus_line = False
+                    quality_line = False
+                elif in_1.startswith('+') and not plus_line:
+                    # Separator line: mate 1's line is used for both files
+                    writer_1.write(in_1 + '\\n')
+                    writer_2.write(in_1 + '\\n')
+                    plus_line = True
+                elif plus_line and not quality_line:
+                    # Quality line: completes the read pair
+                    writer_1.write(in_1 + '\\n')
+                    writer_2.write(in_2 + '\\n')
+                    number_reads += 1
+                    quality_line = True
+                else:
+                    # Sequence line
+                    writer_1.write(in_1 + '\\n')
+                    writer_2.write(in_2 + '\\n')
+
+    return number_reads, outfiles
+
+
+def _decompress_to(fastq, ftype, target):
+    """Write the decompressed content of *fastq* to the path *target*."""
+    # Local import: shutil is not among the imports at the top of this file.
+    import shutil
+
+    if ftype == "zip":
+        # zipfile.ZipFile is an archive reader, not a file-like opener (it
+        # rejects the "rt" mode), so a member has to be extracted explicitly.
+        # NOTE(review): assumes the archive holds a single FastQ file; only
+        # the first member is read.
+        with zipfile.ZipFile(fastq) as archive:
+            members = archive.namelist()
+            if not members:
+                sys.exit("[{}] Empty zip archive".format(fastq))
+            with archive.open(members[0]) as f_in, open(target, "wb") as f_out:
+                shutil.copyfileobj(f_in, f_out)
+    else:
+        with COPEN[ftype](fastq, "rb") as f_in, open(target, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+
+
+def main(sample_id, fastq_files):
+    """Rename the headers of a FastQ pair and gzip the renamed files.
+
+    Parameters
+    ----------
+    sample_id : str
+        Sample identifier, used as prefix of the output file names.
+    fastq_files : list
+        Paths of the mate 1 and mate 2 FastQ files, in that order. Files
+        detected as gzip, bzip2 or zip are first decompressed to a temporary
+        plain copy in the working directory, because
+        :py:func:`formartFastqHeaders` is given file paths.
+    """
+    # NOTE(review): MainWrapper is imported at the top of this file but is not
+    # applied to main() - confirm whether that is intended.
+
+    import shutil
+
+    logger.info("STARTING renamePE_samtoolsFASTQ.py")
+
+    plain_files = []
+    temp_files = []
+
+    for fastq in fastq_files:
+
+        logger.info("Processing file {}".format(fastq))
+
+        logger.info("[{}] Guessing file compression".format(fastq))
+        ftype = guess_file_compression(fastq)
+
+        if not ftype:
+            logger.info("[{}] File compression not found. Assuming an uncompressed file".format(fastq))
+            plain_files.append(fastq)
+            continue
+
+        logger.info("[{}] Found file compression: {}".format(fastq, ftype))
+        # The suffix keeps the temporary copy out of the "*.fq.gz" outputs
+        plain = os.path.join(os.getcwd(), os.path.basename(fastq) + ".decompressed")
+        temp_files.append(plain)
+        _decompress_to(fastq, ftype, plain)
+        plain_files.append(plain)
+
+    logger.info('Renaming fastq headers')
+    try:
+        number_reads, outfiles = formartFastqHeaders(sample_id, plain_files[0], plain_files[1])
+    finally:
+        # Temporary decompressed copies are not needed past this point
+        for temp in temp_files:
+            if os.path.exists(temp):
+                os.remove(temp)
+
+    logger.info('{} read pairs were written in {} and {}. Compressing...'.format(number_reads, outfiles[0], outfiles[1]))
+
+    # Compress each renamed file, then drop the uncompressed version
+    for outfile in outfiles:
+        with open(outfile, 'rb') as f_in, gzip.open(outfile + '.gz', 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+        os.remove(outfile)
+
+    logger.info('DONE')
+
+
+# Script entry point: SAMPLE_ID and FASTQ_PAIR are the module-level values
+# bound from the Nextflow placeholders earlier in this file.
+if __name__ == "__main__":
+ main(SAMPLE_ID, FASTQ_PAIR)