diff --git a/.gitignore b/.gitignore index cc05205..c55e1a0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ logs .snakemake site +__pycache__ output .tests/illumnia_demux/dry_run_out .tests/dry_run_out diff --git a/docs/install.md b/docs/install.md index c56df9b..986b86d 100644 --- a/docs/install.md +++ b/docs/install.md @@ -37,23 +37,14 @@ Biowulf uses environmental modules to control software. After executing the abov > [+] Loading singularity 4.X.X on cnXXXX ```bash title="Bigsky" -source /data/openomics/bin/dependencies.sh` +source /data/openomics/bin/source_weave.sh ``` -Bigsky uses spack to load modules so a consolidated conda environment with snakemake is activated: +Bigsky uses the same Lua module system that Biowulf uses to load weave's dependencies. -```bash title="dependencies.sh" +```bash title="source_weave.sh" -if [ ! -x "$(command -v "snakemake")" ]; then - source /gs1/apps/user/rmlspack/share/spack/setup-env.sh - export PS1="${PS1:-}" - spack load -r miniconda3@4.11.0/y4vyh4u - source activate snakemake7-19-1 -fi -# Add this folder to $PATH -export PATH="/data/openomics/bin:${PATH}" -# Add different pipelines to $PATH -export PATH="/data/openomics/prod/rna-seek/latest:${PATH}" -export PATH="/data/openomics/prod/metavirs/latest:${PATH}" +module load snakemake/7.22.0-ufanewz +pip install -r /data/openomics/prod/weave/latest/requirements.txt ``` While, singularity is installed to the **BigSky** system and available upon login. 
@@ -73,4 +64,4 @@ cd weave # git repository root -s .tests/illumnia_demux \ -o .tests/illumnia_demux/dry_run_out \ --local --dry-run /opt2/.tests/illumnia_demux -``` \ No newline at end of file +``` diff --git a/scripts/config.py b/scripts/config.py index cc4565b..1091f63 100644 --- a/scripts/config.py +++ b/scripts/config.py @@ -23,7 +23,7 @@ def get_current_server(): # biowulf hostnames re_biowulf_head = (r"biowulf\.nih\.gov", "biowulf") re_biowulf_compute = (r"cn\d{4}", "biowulf") - + # skyline hostnames re_skyline_head = (r"ai-hpc(submit|n)(\d+)?", "skyline") re_skyline_compute = (r"ai-hpc(submit|n)(\d+)?", "skyline") @@ -48,7 +48,7 @@ def get_current_server(): FRCE_PATH = "COVID-19_Consortium" -# ~~~ labkey configurations ~~~ +# ~~~ labkey configurations ~~~ CONTEXT_PATH = "labkey" LABKEY_CONFIGS = { "bigsky": {"domain": BIGSKY_DEV, "container_path": BIGSKY_PATH, "context_path": CONTEXT_PATH, "use_ssl": True}, @@ -56,7 +56,7 @@ def get_current_server(): } -# ~~~ snakemake configurations ~~~ +# ~~~ snakemake configurations ~~~ illumina_pipelines = defaultdict(lambda: Path(Path(__file__).parent.parent, "workflow", "Snakefile").resolve()) # can add support for NextSeq2k and bclconvert here SNAKEFILE = { @@ -69,7 +69,7 @@ def get_current_server(): def get_resource_config(): - """Return a dictionary containing server specific references utilized in + """Return a dictionary containing server specific references utilized in the workflow for directories or reference files. 
Returns: @@ -131,7 +131,7 @@ def get_bigsky_seq_dirs(): Returns: (list): list of `pathlib.Path`s of all sequencing directories on bigsky server """ - top_dir = Path("/gs1/RTS/NextGen/SequencerRuns/") + top_dir = Path("/data/rml_ngs/SequencerRuns/") transfer_breadcrumb = "RTAComplete.txt" if not top_dir.exists(): return None @@ -141,7 +141,7 @@ def get_bigsky_seq_dirs(): for this_child_elem in this_dir.iterdir(): try: elem_checks = [ - this_child_elem.is_dir(), + this_child_elem.is_dir(), Path(this_child_elem, transfer_breadcrumb).exists(), check_access(this_child_elem, R_OK) ] @@ -155,13 +155,13 @@ def get_bigsky_seq_dirs(): def get_tmp_dir(host): TMP_CONFIGS = { 'skyline': {'user': '/data/scratch/$USER/$SLURM_JOBID', 'global': '/data/scratch/$USER/' + str(uuid4())}, - 'bigsky': {'user': '/gs1/Scratch/$USER/$SLURM_JOBID', 'global': '/gs1/Scratch/$USER/' + str(uuid4())}, + 'bigsky': {'user': '/data/scratch/$USER/$SLURM_JOBID', 'global': '/data/scratch/$USER/' + str(uuid4())}, 'biowulf': {'user': '/lscratch/$SLURM_JOBID', 'global': '/tmp/$USER/' + str(uuid4())} } this_tmp = TMP_CONFIGS[host]['user'] - # this directory, if it does not exist, + # this directory, if it does not exist, if Path(this_tmp).parents[0].exists(): return this_tmp else: @@ -170,7 +170,7 @@ def get_tmp_dir(host): DIRECTORY_CONFIGS = { "bigsky": { - "seqroot": "/gs1/RTS/NextGen/SequencerRuns/", + "seqroot": "/data/rml_ngs/SequencerRuns", "seq": get_bigsky_seq_dirs(), "profile": Path(Path(__file__).parent.parent, "utils", "profiles", "bigsky").resolve(), }, @@ -222,4 +222,4 @@ def get_tmp_dir(host): "mesaur": "/data/openomics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz", "cynomac": "/data/openomics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz", }, -} \ No newline at end of file +} diff --git a/scripts/files.py b/scripts/files.py index 4cc4127..7e75ee7 100644 --- a/scripts/files.py +++ b/scripts/files.py @@ -13,7 +13,7 @@ def 
get_all_seq_dirs(top_dir, server): """ - Gather and return all sequencing directories from the `top_dir`. + Gather and return all sequencing directories from the `top_dir`. This is tightly coupled at the moment to the directory that is on RML-BigSky. In the future will need to the take a look at how to do this more generally """ @@ -42,7 +42,7 @@ def valid_run_output(output_directory, dry_run=False): output_directory = Path(output_directory).absolute() if not output_directory.exists(): output_directory.mkdir(parents=True, mode=0o765) - + if not check_access(output_directory, W_OK): raise PermissionError(f'Can not write to output directory {output_directory}') return output_directory @@ -70,7 +70,7 @@ def valid_fasta(suspect): if not is_valid: raise ValueError - + return suspect @@ -147,7 +147,7 @@ def find_demux_dir(run_dir): if len(demux_stat_files) != 1: raise FileNotFoundError - + return Path(demux_stat_files[0], '..').absolute() @@ -160,7 +160,7 @@ def get_run_directories(runids, seq_dir=None, sheetname=None): for secondchild in firstchild.iterdir(): seq_contents.append(secondchild) seq_contents_names = [child for child in map(lambda d: d.name, seq_contents)] - + run_paths, invalid_runs = [], [] run_return = [] for run in runids: @@ -192,7 +192,7 @@ def get_run_directories(runids, seq_dir=None, sheetname=None): sheet = Path(run_p, sheetname).absolute() else: raise FileNotFoundError(f'Run {rid}({run_p}) does not have a find-able sample sheet.') - + this_run_info['samplesheet'] = parse_samplesheet(sheet) this_run_info.update({info.tag: info.text for run in runinfo_xml.getroot() for info in run \ if info.text is not None and info.text.strip() not in ('\n', '')}) @@ -201,5 +201,5 @@ def get_run_directories(runids, seq_dir=None, sheetname=None): if invalid_runs: raise ValueError('Runs entered are invalid (missing sequencing artifacts or directory does not exist): \n' + \ ', '.join(invalid_runs)) - - return run_return \ No newline at end of file + + return run_return 
diff --git a/scripts/utils.py b/scripts/utils.py index e3427f7..b068196 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -97,7 +97,7 @@ def valid_run_input(run): def exec_snakemake(popen_cmd, local=False, dry_run=False, env=None, cwd=None): - # async execution w/ filter: + # async execution w/ filter: # - https://gist.github.com/DGrady/b713db14a27be0e4e8b2ffc351051c7c # - https://lysator.liu.se/~bellman/download/asyncproc.py # - https://gist.github.com/kalebo/1e085ee36de45ffded7e5d9f857265d0 @@ -113,7 +113,6 @@ def exec_snakemake(popen_cmd, local=False, dry_run=False, env=None, cwd=None): popen_kwargs['cwd'] = cwd else: popen_kwargs['cwd'] = str(Path.cwd()) - parent_jobid = None if local or dry_run: popen_kwargs['env'].update(os.environ) @@ -170,9 +169,7 @@ def get_mods(init=False): mod_cmd = [] if host == 'bigsky': - mod_cmd.append('source /gs1/apps/user/rmlspack/share/spack/setup-env.sh') - mod_cmd.append('spack load miniconda3@4.11.0') - mod_cmd.append('source activate snakemake7-19-1') + mod_cmd.append('module load snakemake/7.22.0-ufanewz') elif host == 'skyline': mod_cmd.append('source /data/openomics/bin/dependencies.sh') elif host == 'biowulf': @@ -220,8 +217,9 @@ def get_mounts(*extras): raise FileNotFoundError(f"Can't mount {str(bind)}, it doesn't exist!") file_to, file_from, mode = str(bind), str(bind), 'rw' mounts.append(file_from + ':' + file_to + ':' + mode) - - mounts.append(r'\$TMPDIR:/tmp:rw') + + if 'TMPDIR' in os.environ: + mounts.append(os.environ['TMPDIR'] + ':/tmp:rw') return ','.join(mounts) @@ -265,13 +263,15 @@ def exec_pipeline(configs, dry_run=False, local=False): top_env['PATH'] = os.environ["PATH"] top_env['SNK_CONFIG'] = str(config_file.absolute()) top_env['SINGULARITY_CACHEDIR'] = str(Path(this_config['out_to'], '.singularity').absolute()) + top_env['SINGULARITY_CONTAINALL'] = '1' + top_env['APPTAINER_CONTAINALL'] = '1' this_cmd = [ - "snakemake", "-p", "--use-singularity", "--rerun-incomplete", "--keep-incomplete", - 
"--rerun-triggers", "mtime", "--verbose", "-s", snake_file, + "snakemake", "-p", "--cores", "2", "--use-singularity", "--rerun-incomplete", "--keep-incomplete", + "--rerun-triggers", "mtime", "--verbose", "-s", str(snake_file), ] if singularity_binds and not dry_run: - this_cmd.extend(["--singularity-args", f"\"--env 'TMPDIR=/tmp' -C -B '{singularity_binds}'\""]) + this_cmd.extend(["--singularity-args", f"\"-B '{singularity_binds}'\""]) if dry_run: print(f"{esc_colors.OKGREEN}> {esc_colors.ENDC}{esc_colors.UNDERLINE}Dry run{esc_colors.ENDC} " + \ @@ -314,4 +314,4 @@ def valid_host_pathogen_genomes(host, pathogen): if not g2: raise ValueError('Pathogen genome does not exist on the file system.') - return host, pathogen \ No newline at end of file + return host, pathogen diff --git a/utils b/utils index e22cdd0..ea09e92 160000 --- a/utils +++ b/utils @@ -1 +1 @@ -Subproject commit e22cdd01290872021357651c311a26962862b416 +Subproject commit ea09e92ea34500716c10e95744743ce95a6fa38b diff --git a/workflow/fastq.smk b/workflow/fastq.smk index 552fa84..0fdb83d 100644 --- a/workflow/fastq.smk +++ b/workflow/fastq.smk @@ -76,7 +76,7 @@ rule kaiju_annotation: log: config['out_to'] + "/logs/" + config["project"] + "/kaiju/{sids}.log", threads: 24 resources: - mem_mb = 220000, + mem_mb = 300000, runtime = 60*24*2 shell: """ @@ -109,7 +109,7 @@ rule kraken_annotation: log: config['out_to'] + "/logs/" + config["project"] + "/kraken/{sids}.log", threads: 24 resources: - mem_mb = 220000, + mem_mb = 300000, runtime = 60*24*2 shell: """