korem-lab · Bowen0715 · May 9, 2024 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/LargeDataTips.md b/LargeDataTips.md
@@ -0,0 +1,87 @@
+# Tips for using ICRA on large datasets with a large reference database
+
+## Overview
+
+The ICRA algorithm can be broken down into four steps in the view of generating new files:
+
+1. fastq -> bam
+2. bam -> pmp
+3. pmp -> jsdel + jspi
+4. jsdel -> smp
+
+*The `icra` command includes `svfinder get_sample_map` which generates smp files*
+
+When using the `icra` command on large datasets with a large reference database, significant time and memory are required.
+By separating those steps above and utilizing parallel computing, the process can be expedited.
+However, considering the CPU core count, disk storage space, and memory size of the computer or server, each step requires different setting.
+The arguments `bamfol` and `pmpfol` allow files to be distributed separately.
+Here is an example demonstrating the whole ICRA algorithm (*30G reference database*).
+
+| No. | Files | Multi-threads | Memory Usage |  Time Usage|File size (Example) |
+|---|---|---|---|---|---|
+|0|fastq|-|-|-|16G|
+|1|fastq -> bam|Yes|-|-|8.3G|
+|2|bam -> pmp|No|Low (<1G)|1.5h|11G|
+|3|pmp -> jsdel + jspi|No|High (30G~100G)|4h~2days|913M (jsdel) + 11K (jspi)|
+|4|jsdel -> smp|No|High (32.92G)|4min|303M|
+
+## Step 1: BAM file
+
+The first step involves using bowtie2 and samtools to generate bam files from the fastq files.
+This can be done using bowtie2 and samtools directly or using the `generate_bam` command in `icra` with multiple threads.
+With the argument `--generate_bam`, the `icra` command will only generate bam files.
+The default command of the `generate_bam` is this:
+
+```bash
+bowtie2 -x ${db}/${db_name} --very-sensitive -k 20 -U ${dir}/${filename}.fastq --quiet -p ${threads} \
+    | samtools view -bS -@ ${threads} - > ${bam_dir}/${filename}.bam
+```
+
+## Step 2: PMP file
+
+The second step uses the function `sam2pmp` in the script `SGVFinder2\helpers\sam2pmp.py`.
+This step cannot use multithreading but consumes minimal memory (less than 1 GB) and takes a long time (1~6 hours for 16G fastq).
+Therefore, it is feasible and recommended to analyze multiple samples concurrently.
+This step can be executed separately using the `bam_to_pmp` argument in the `icra` command.
+With the argument `--bam_to_pmp`, the `icra` command will only convert bam files to pmp files.
+
+## Step 3~4: ICRA
+
+With the bam and pmp files available and their paths properly specified, the `icra` command will bypass step 1-2 and continue to generate jsdel files.
+Step 3 consumes a large amount of memory (30GB-100G for 16Gfastq) and considerable time (3min-20ming per iteration for 16Gfastq, 4h-2days total) with the default argument (epsilon=1e-6,
+max_iterations=100).
+Step 4 is the same as `svfinder get_sample_map`, which consumes large memory but not much time. So it is not necessary to separate step 4 from step 3.
+
+## Conclusion
+
+Step 1 can utilize multiple threads and/or analyze multiple samples concurrently. Step 2 can analyze multiple samples concurrently, using `rush` or `parallel`
+This makes full use of the computer or server resources and shortens the analysis time.
+Step 3 consumes a large and uncertain amount of memory, making it the bottleneck of the entire ICRA analysis process.
+
+Here are some example codes:
+
+```bash
+fqdir=~/test.fastq
+db=~/db/db_name
+bamdir=~/bam
+pmpdir=~/pmp
+outdir=~/
+listdir=~/list.txt # sample name list
+threads=30
+# Using specified bamfol and pmpfol
+icra --generate_bam --bamfol ${bamdir} --fq1 ${fqdir} --db ${db} --threads ${threads}
+icra --bam_to_pmp --bamfol ${bamdir} --pmpfol ${pmpdir} --fq1 ${fqdir}
+icra --bamfol ${bamdir} --pmpfol ${pmpdir} \
+    --outfol ${outdir} --fq1 ${fqdir} \
+    --db ${db} --use_theta --debug > sgv.log 2>&1
+
+# Using only outfol
+icra --generate_bam --outfol ${outdir} --fq1 ${fqdir} --db ${db} --threads ${threads}
+icra --bam_to_pmp --outfol ${outdir} --fq1 ${fqdir}
+icra --outfol ${outdir} --fq1 ${fqdir} \
+    --db ${db} --use_theta --debug > sgv.log 2>&1
+
+# Parallel computing using rush
+cat ${listdir} | rush -j ${threads} \
+    "icra --bam_to_pmp --outfol ${outdir} --fq1 ${fqdir}"
+```
diff --git a/SGVFinder2/__init__.py b/SGVFinder2/__init__.py
@@ -2,6 +2,8 @@
 from .svfinder import get_sample_map, work_on_collection
 from .createdb import create_db_from_reps
 from distutils.spawn import find_executable
+from .helpers.Bowtie2WrapperSlim import do_pair_simple
+from .helpers.sam2pmp import sam2pmp
 
 assert find_executable('bowtie2') is not None, 'Cannot find `bowtie2` installation'
 assert find_executable('samtools') is not None, 'Cannot find `samtools` installation'
diff --git a/SGVFinder2/cli/icra_cli.py b/SGVFinder2/cli/icra_cli.py
@@ -3,11 +3,15 @@
 from pandas import to_pickle
 from ..icra import single_file
 from ..svfinder import get_sample_map
-from ..helpers.Bowtie2WrapperSlim import MapPreset
+from ..helpers.Bowtie2WrapperSlim import MapPreset, do_pair_simple
+from ..helpers.sam2pmp import sam2pmp
+from os.path import join, basename, exists
 
 parser = argparse.ArgumentParser()
 
 parser.add_argument('--outfol', help='Where to write results')
+parser.add_argument('--bamfol', help='Path to generate or to the exists bam file (defaults to outfol)')
+parser.add_argument('--pmpfol', help='Path to generate or to the exists pmp file (defaults to outfol)')
 parser.add_argument('--fq1', help='Path to first fastq file')
 parser.add_argument('--fq2', help='Path to second fastq file (defaults to None)', default=None)
 parser.add_argument('--db', help='Database path (including db prefix)', default=None)
@@ -29,36 +33,68 @@
 parser.add_argument('--max_ins', help='Bowtie2 -X parameter, the maximum fragment length for valid paired-end alignments', default=750)
 parser.add_argument('--x_coverage', help = '`get_sample_map` param- the desired coverage across the genome in units of 100bp reads. This parameter is used to determine bin size: bin_size = rate_param/x_coverage (Default = 0.01)', type=float, default = 0.01)
 parser.add_argument('--rate_param', help = '`get_sample_map` param- the lower limit for the median number of reads per genomic bin. Genomes with coverage lower than rate_param will be discarded from the analysis (default = 10)', type = int, default = 10)
-
+
+parser.add_argument('--generate_bam', action='store_true', help='Only generate BAM files from FASTQ files using bowtie2 and samtools.')
+parser.add_argument('--bam_to_pmp', action='store_true', help='Only convert BAM files to PMP files. Use this after generating BAM files with bowtie2 and samtools (`generate_bam` command).')
+
 args = parser.parse_args()
 logging.basicConfig(level=(logging.DEBUG if args.debug else logging.INFO),
                     format='%(asctime)s-%(levelname)s: %(message)s')
 logging.debug(args)
 
 def run():
-    print('Running ICRA `single_file` command...')
-    jspi_file, jsdel_file = single_file(
-        fq1= args.fq1,
-        fq2 = args.fq2,
-        outfol=args.outfol,
-        max_mismatch=args.max_mismatch,
-        consider_lengths=args.consider_lengths,
-        epsilon=args.epsilon,
-        max_iterations=args.max_iterations,
-        min_bins=args.min_bins,
-        max_bins=args.max_bins,
-        min_reads=args.min_reads,
-        dense_region_coverage=args.dense_region_coverage,
-        length_minimum=args.length_minimum,
-        length_maximum=args.length_maximum,
-        dbpath=args.db,
-        use_theta=args.use_theta,
-        threads=args.threads,
-        senspreset=args.sensitivity,
-        report_alns=args.report_alignments,
-        max_ins=args.max_ins
-    )
-    print('Finished running ICRA, saving results to %s'%args.outfol)
-    print('Running `get_sample_map` on %s, output will be saved to %s'%(jsdel_file,jsdel_file.replace('.jsdel','.smp')))
-    sample_map = get_sample_map(jsdel_file,args.db+'.dlen',args.x_coverage, args.rate_param) 
-    to_pickle(sample_map,jsdel_file.replace('.jsdel','.smp'))
+    fq1= args.fq1
+    fq2 = args.fq2
+    if args.bamfol is None:
+        args.bamfol = args.outfol
+    if args.pmpfol is None:
+        args.pmpfol = args.outfol
+    outfol = args.outfol
+    bamfol = args.bamfol
+    pmpfol = args.pmpfol
+    file_basename = basename(fq1).replace('_1.fastq.gz', '').replace('_1.fastq', '').replace('.fastq.gz', '').replace(
+            '.fastq', '').replace('_1.fq.gz', '').replace('_1.fq', '').replace('.fq.gz', '').replace('.fq', '')
+    bampref = join(bamfol, file_basename)
+    if args.generate_bam:
+        print('Running ICRA `generate_bam` command...')
+        do_pair_simple(fq1, fq2, bampref, 
+                       indexf = args.db, 
+                       senspreset=args.sensitivity, 
+                       report_alns=args.report_alignments, 
+                       max_ins=args.max_ins, 
+                       threads=args.threads)
+        print('Mapped. BAM ready')
+    elif args.bam_to_pmp:
+        print('Running ICRA `bam_to_pmp` command...')
+        pmppref = join(pmpfol, file_basename) 
+        if exists(bampref + '.bam'):
+            print('%s exists, converting to pmp...'%(bampref + '.bam'))
+            sam2pmp(bampref + '.bam', pmppref + '.pmp', full=True)
+            print('Finished running bam_to_pmp, saving results to %s'%args.pmpfol)
+        else:
+            print('NO BAM files found in %s. Please run ICRA `generate_bam` command first!'%(bamfol))
+    else:
+        print('Running ICRA `single_file` command...')
+        jspi_file, jsdel_file = single_file(
+            fq1, fq2, bamfol, pmpfol, outfol,
+            max_mismatch=args.max_mismatch,
+            consider_lengths=args.consider_lengths,
+            epsilon=args.epsilon,
+            max_iterations=args.max_iterations,
+            min_bins=args.min_bins,
+            max_bins=args.max_bins,
+            min_reads=args.min_reads,
+            dense_region_coverage=args.dense_region_coverage,
+            length_minimum=args.length_minimum,
+            length_maximum=args.length_maximum,
+            dbpath=args.db,
+            use_theta=args.use_theta,
+            threads=args.threads,
+            senspreset=args.sensitivity,
+            report_alns=args.report_alignments,
+            max_ins=args.max_ins
+        )
+        print('Finished running ICRA, saving results to %s'%args.outfol)
+        print('Running `get_sample_map` on %s, output will be saved to %s'%(jsdel_file,jsdel_file.replace('.jsdel','.smp')))
+        sample_map = get_sample_map(jsdel_file,args.db+'.dlen',args.x_coverage, args.rate_param) 
+        to_pickle(sample_map,jsdel_file.replace('.jsdel','.smp'))
diff --git a/SGVFinder2/icra.py b/SGVFinder2/icra.py
@@ -34,7 +34,8 @@ def get_ujson_splitting_point(delta):
 
 
 def single_file(
-        fq1, fq2, 
+        fq1, fq2,
+        bamfol, pmpfol,
         outfol=os.getcwd(),
         max_mismatch=8,
         consider_lengths=False,
@@ -57,6 +58,12 @@ def single_file(
         print('Creating directory %s...'%outfol)
         Path(outfol).mkdir(parents=True, exist_ok=True)
 
+    if bamfol is None:
+        bamfol = outfol
+
+    if pmpfol is None:
+        pmpfol = outfol
+
     if fq2 is None:
         print('Running ICRA on single-end read!')
         print('Single-',fq1)
@@ -68,27 +75,28 @@ def single_file(
 
     log_.debug('Loading database...')
     length_db_f, indexf, dest_dictf, dlen_db_f = dbpath + '.lengths', dbpath, dbpath + '.dests', dbpath + '.dlen'
-    outpref = join(outfol,
-                   basename(fq1).replace('_1.fastq.gz', '').replace('_1.fastq', '').replace('.fastq.gz', '').replace(
-                       '.fastq', ''))
+    file_basename = basename(fq1).replace('_1.fastq.gz', '').replace('_1.fastq', '').replace('.fastq.gz', '').replace(
+            '.fastq', '').replace('_1.fq.gz', '').replace('_1.fq', '').replace('.fq.gz', '').replace('.fq', '')
+    bampref = join(bamfol, file_basename) 
+    pmppref = join(pmpfol, file_basename) 
+    outpref = join(outfol, file_basename)
     log_.debug('Loaded. Mapping file...')
 
-    if not exists(outpref + '.bam'):
-        do_pair_simple(fq1, fq2, outpref, indexf, senspreset, report_alns, max_ins, threads)
+    if not exists(bampref + '.bam'):
+        do_pair_simple(fq1, fq2, bampref, indexf, senspreset, report_alns, max_ins, threads)
     else:
-        print('%s already exists, skipping mapping step...'%(outpref + '.bam'))
-
+        print('%s already exists, skipping mapping step...'%(bampref + '.bam'))
     log_.debug('Mapped. Converting to pmp')
-    if not exists(outpref + '.pmp'):
-        sam2pmp(outpref + '.bam', outpref + '.pmp', full=True)
+    if not exists(pmppref + '.pmp'):
+        sam2pmp(bampref + '.bam', pmppref + '.pmp', full=True)
     #AYA DEBUG - i commented it out to save bam
     #_tryrm(outpref + '.bam')
     log_.debug('PMP ready. Running ICRA...')
     #if not exists('delta_new_for_debug.pkl'):
-    if not exists(outpref + '_1.jsdel'):
+    if not exists(outpref + '.jsdel'):
 
         read_container, pi, theta1, average_read_length, lengthdb = \
-            _initialize(fq1, fq2, outpref + '.pmp', dlen_db_f, max_mismatch, consider_lengths, length_minimum,
+            _initialize(fq1, fq2, pmppref + '.pmp', dlen_db_f, max_mismatch, consider_lengths, length_minimum,
                         length_maximum,
                         min_reads, min_bins, max_bins, dense_region_coverage, dest_dictf, use_theta)
         if len(pi) == 0:

diff --git a/SGVFinder2/svfinder.py b/SGVFinder2/svfinder.py
@@ -436,7 +436,10 @@ def _cooc_dissim(v, u):
 
 
 def _spearman_dissim(v, u):
-    return 1 - ((spearmanr(v, u)[0] + 1) / 2)
+    correlation, _ = spearmanr(v, u, nan_policy='omit')
+    if np.isnan(correlation):
+        correlation = 0
+    return 1 - ((correlation + 1) / 2)
 
 
 def draw_one_region(bacname, binsize, taxonomy, normdf,
@@ -459,42 +462,46 @@ def draw_one_region(bacname, binsize, taxonomy, normdf,
                 p1.vbar(s[0] + (s[1] - s[0]) / 2. - .5, s[1] - s[0] - .5, dfmax, dfmin,
                         color='DarkBlue', alpha=0.2)
     drawdf = DataFrame({i: normdf[i] if i in normdf.columns else np.nan for i in bacdf.columns})
-    p1.line(range(drawdf.shape[1]), drawdf.quantile(0.01), color='Black', alpha=.7)
-    p1.line(range(drawdf.shape[1]), drawdf.quantile(0.25), color='Black', alpha=1)
-    p1.line(range(drawdf.shape[1]), drawdf.median(), color='Black', alpha=1, line_width=2.5)
-    p1.line(range(drawdf.shape[1]), drawdf.quantile(0.75), color='Black', alpha=1)
-    p1.line(range(drawdf.shape[1]), drawdf.quantile(0.99), color='Black', alpha=.7)
-    locgeneposs = geneposs.loc[bacname]
-    locgeneposs = locgeneposs[~locgeneposs.isnull().any(1)]
-    noise = norm.rvs(0, 0.75, size=len(locgeneposs))
-    source = ColumnDataSource(data=dict(
-        x=locgeneposs[['start_pos', 'end_pos']].mean(1).truediv(binsize).values,
-        xs=list(zip(locgeneposs['start_pos'].truediv(binsize).values, locgeneposs['end_pos'].truediv(binsize).values)),
-        y=locgeneposs['strand'].replace({'+': 2, '-': -2}),
-        ys=list(zip(locgeneposs['strand'].replace({'+': 2, '-': -2}).add(noise),
-               locgeneposs['strand'].replace({'+': 2, '-': -2}).add(noise))),
-        start=locgeneposs.start_pos.apply(int).apply(str).values,
-        end=locgeneposs.end_pos.apply(int).apply(str).values,
-        strand=locgeneposs.strand.values,
-        gene_name=locgeneposs.index.values,
-        product=locgeneposs['product'].values,
-        gene_type=locgeneposs['gene_type'].values,
-        color=locgeneposs['gene_type'].apply(lambda x: 'DarkRed' if x == 'CDS' else 'DodgerBlue' if x == 'rRNA' else 'Yellow' if x == 'tRNA' else 'Gray').values))
-    hover = HoverTool(tooltips=[
-        ("Gene name", "@gene_name"),
-        ("Position", "@start - @end (@strand)"),
-        ("Gene type", "@gene_type"),
-        ("Product", "@product"),
-    ],
-        mode='vline')
-    p2 = bpl.figure(width=1200, height=150, x_range=p1.x_range,
-                    title=None, tools=[hover, 'tap'])
-    p2.multi_line('xs', 'ys', line_width=4, color='color', source=source)
-    url = "http://www.google.com/search?q=@gene_name"
-    taptool = p2.select(type=TapTool)
-    taptool.callback = OpenURL(url=url)
-    p2.yaxis.axis_label = 'Genes'
-    p2.y_range = Range1d(-5, 5)
+    p1.line(list(range(drawdf.shape[1])), drawdf.quantile(0.01), color='Black', alpha=.7)
+    p1.line(list(range(drawdf.shape[1])), drawdf.quantile(0.25), color='Black', alpha=1)
+    p1.line(list(range(drawdf.shape[1])), drawdf.median(), color='Black', alpha=1, line_width=2.5)
+    p1.line(list(range(drawdf.shape[1])), drawdf.quantile(0.75), color='Black', alpha=1)
+    p1.line(list(range(drawdf.shape[1])), drawdf.quantile(0.99), color='Black', alpha=.7)
+    # Gene positions
+    if geneposs is not None:
+        locgeneposs = geneposs.loc[bacname]
+        locgeneposs = locgeneposs[~locgeneposs.isnull().any(1)]
+        noise = norm.rvs(0, 0.75, size=len(locgeneposs))
+        source = ColumnDataSource(data=dict(
+            x=locgeneposs[['start_pos', 'end_pos']].mean(1).truediv(binsize).values,
+            xs=list(zip(locgeneposs['start_pos'].truediv(binsize).values, locgeneposs['end_pos'].truediv(binsize).values)),
+            y=locgeneposs['strand'].replace({'+': 2, '-': -2}),
+            ys=list(zip(locgeneposs['strand'].replace({'+': 2, '-': -2}).add(noise),
+                locgeneposs['strand'].replace({'+': 2, '-': -2}).add(noise))),
+            start=locgeneposs.start_pos.apply(int).apply(str).values,
+            end=locgeneposs.end_pos.apply(int).apply(str).values,
+            strand=locgeneposs.strand.values,
+            gene_name=locgeneposs.index.values,
+            product=locgeneposs['product'].values,
+            gene_type=locgeneposs['gene_type'].values,
+            color=locgeneposs['gene_type'].apply(lambda x: 'DarkRed' if x == 'CDS' else 'DodgerBlue' if x == 'rRNA' else 'Yellow' if x == 'tRNA' else 'Gray').values))
+        hover = HoverTool(tooltips=[
+            ("Gene name", "@gene_name"),
+            ("Position", "@start - @end (@strand)"),
+            ("Gene type", "@gene_type"),
+            ("Product", "@product"),
+        ],
+            mode='vline')
+        p2 = bpl.figure(width=1200, height=150, x_range=p1.x_range,
+                        title=None, tools=[hover, 'tap'])
+        p2.multi_line('xs', 'ys', line_width=4, color='color', source=source)
+        url = "http://www.google.com/search?q=@gene_name"
+        taptool = p2.select(type=TapTool)
+        taptool.callback = OpenURL(url=url)
+        p2.yaxis.axis_label = 'Genes'
+        p2.y_range = Range1d(-5, 5)
+    else:
+        p2 = None
     # Deletions
     if p3 is not None:
         p3.xaxis.axis_label = 'Bin position'
@@ -508,7 +515,7 @@ def draw_one_region(bacname, binsize, taxonomy, normdf,
                     p3.vbar(d[0] + (d[1] - d[0]) / 2. - .5, d[1] - d[0] - .5, 1.5, 0,
                             color='ForestGreen', alpha=0.5)
         a = 1 - deldf.sum(0).truediv(deldf.count(0))
-        p3.line(range(len(a)), a.values, color='FireBrick', alpha=1, line_width=2.5)
+        p3.line(list(range(len(a))), a.values, color='FireBrick', alpha=1, line_width=2.5)
         # Output
         bpl.output_file(join(outputdir, bacname + '.html'),
                         title=tax)