@@ -32,7 +32,7 @@ type Stat4 = object
3232 y_dp: RunningStat
3333
3434
35- type relation_matrices = object {. shallow .}
35+ type relation_matrices = object
3636 sites_tested: int
3737 ibs: seq [uint16 ]
3838 n: seq [uint16 ]
@@ -472,15 +472,17 @@ proc read_extracted*(paths: seq[string], min_ab: float, min_depth: int,
472472
473473const missing = [" ." , " 0" , " -9" , " " ]
474474
475- proc high_quality (gt_counts: array [5 , seq [uint16 ]], i: int ): bool {.inline .} =
475+ proc high_quality (gt_counts: array [5 , seq [uint16 ]], i: int ,
476+ check_hom_alts: bool ): bool {.inline .} =
476477 # less than 3% of sites with allele balance outside of 0.1 .. 0.9
477478 result = gt_counts[4 ][i].float / (gt_counts[0 ][i] + gt_counts[1 ][i] +
478479 gt_counts[2 ][i] + gt_counts[3 ][i] + gt_counts[4 ][i]).float < 0.06
479480 if not result :
480481 return false
481482
482- # should have fewer hom-alts[2] than hets[1]
483- result = gt_counts[2 ][i].float / max (1 , gt_counts[1 ][i].float ) < 0.7
483+ if check_hom_alts:
484+ # should have fewer hom-alts[2] than hets[1]
485+ result = gt_counts[2 ][i].float / max (1 , gt_counts[1 ][i].float ) < 0.7
484486
485487
486488proc samples_have_y_depth (stats: seq [Stat4 ]): bool =
@@ -534,9 +536,10 @@ proc related(final: var relation_matrices, L: SampleLooker,
534536
535537
536538proc add_parents_and_check_sex (final: var relation_matrices, stats: seq [Stat4 ],
537- gt_counts: array [5 , seq [uint16 ]], i: int , L: var SampleLooker ) =
539+ gt_counts: array [5 , seq [uint16 ]], i: int , L: var SampleLooker ,
540+ check_hom_alts: bool ) =
538541 # first pass here updates sample parents as needed.
539- if not gt_counts.high_quality (i):
542+ if not gt_counts.high_quality (i, check_hom_alts ):
540543 return
541544 let sample_name = L.sample_names[i]
542545 var sample = L.sample_table.getOrDefault (sample_name, Sample (id: sample_name,
@@ -548,6 +551,7 @@ proc add_parents_and_check_sex(final: var relation_matrices, stats: seq[Stat4],
548551 if sample.sex == 2 :
549552 stderr.write_line & " [somalier] setting sex to male for { sample.id} "
550553 sample.sex = 1
554+ assert sample.sex == 1
551555 elif stats[i].x_het / stats[i].x_hom_alt > 0.4 and stats[i].x_dp.n > 10 :
552556 if sample.sex != 2 :
553557 if sample.sex == 1 :
@@ -574,6 +578,7 @@ proc add_parents_and_check_sex(final: var relation_matrices, stats: seq[Stat4],
574578 if sample.maternal_id != possible_parents[1 ]:
575579 stderr.write_line & " [somalier] NOTE: updating maternal_id for { sample.id} to { possible_parents[1 ]} "
576580 sample.maternal_id = possible_parents[1 ]
581+ echo " sample:" , $ sample
577582 L.sample_table[sample.id] = sample
578583
579584proc add_parent_to_sibs (final: var relation_matrices, stats: seq [Stat4 ],
@@ -631,12 +636,13 @@ proc remove_spurious_parent_ids(final: var relation_matrices, L: SampleLooker,
631636
632637
633638proc add_siblings (final: var relation_matrices, stats: seq [Stat4 ],
634- gt_counts: array [5 , seq [uint16 ]], L: var SampleLooker ) =
639+ gt_counts: array [5 , seq [uint16 ]], L: var SampleLooker ,
640+ check_hom_alts: bool ) =
635641 for sample_name, sib_names in L.sib_pairs:
636642 let isample = L.sample_table[sample_name]
637643 let iset = sib_names.toHashSet
638644 let i = isample.i
639- if i >= 0 and not gt_counts.high_quality (i): continue
645+ if i >= 0 and not gt_counts.high_quality (i, check_hom_alts ): continue
640646 var ipids = [isample.paternal_id, isample.maternal_id]
641647 let parent_order = [" dad" , " mom" ]
642648 for sn in sib_names:
@@ -653,7 +659,7 @@ proc add_siblings(final: var relation_matrices, stats: seq[Stat4],
653659 j = jsample.i
654660 except KeyError :
655661 continue
656- if j >= 0 and not gt_counts.high_quality (j): continue
662+ if j >= 0 and not gt_counts.high_quality (j, check_hom_alts ): continue
657663 # if isample.paternal_id notin missing and isample.maternal_id notin missing and isample.paternal_id == jsample.paternal_id and isample.maternal_id == jsample.maternal_id: continue
658664 # # TODO: some logic problems below. check in CEPH
659665 var jpids = [jsample.paternal_id, jsample.maternal_id]
@@ -740,14 +746,15 @@ proc add_siblings(final: var relation_matrices, stats: seq[Stat4],
740746
741747
742748proc update_family_ids (final: relation_matrices, stats: seq [Stat4 ],
743- gt_counts: array [5 , seq [uint16 ]], i: int , L: var SampleLooker ) =
749+ gt_counts: array [5 , seq [uint16 ]], i: int , L: var SampleLooker ,
750+ check_hom_alts: bool ) =
744751 # 2nd pass here updates family ids
745752 let sample_name = L.sample_names[i]
746753 if sample_name notin L.sample_table:
747754 let s = Sample (id: sample_name, family_id: sample_name, sex: - 9 ,
748755 phenotype: " -9" , maternal_id: " -9" , paternal_id: " -9" )
749756 var sample = L.sample_table[sample_name]
750- if not gt_counts.high_quality (i): return
757+ if not gt_counts.high_quality (i, check_hom_alts ): return
751758
752759 let pids = [sample.paternal_id, sample.maternal_id]
753760 for pid in pids:
@@ -785,6 +792,7 @@ proc write_sample(fh: File, stats: seq[Stat4], gt_counts: array[5, seq[uint16]],
785792 var sample = L.sample_table.getOrDefault (sample_name, Sample (id: sample_name,
786793 family_id: sample_name, sex: - 9 , phenotype: " -9" , maternal_id: " -9" ,
787794 paternal_id: " -9" ))
795+ echo " sample final:" , $ sample, " sex:" , sample.sex
788796 fh.write (& " { sample.family_id} \t { sample.id} \t { sample.paternal_id} \t { sample.maternal_id} \t { sample.sex} \t { sample.phenotype} \t " )
789797 fh.write (& " { L.sample_sex.getOrDefault (sample.id, \ " -9\" )}\t " )
790798 fh.write(& " {stats[i].gtdp.mean:.1 f}\ t{stats[i].gtdp.standard_deviation ():.1 f}\ t " )
@@ -880,15 +888,16 @@ proc write_ped(fh: File, final: var relation_matrices, stats: seq[Stat4],
880888
881889
882890 if infer:
891+ let check_hom_alts = getEnv (" SOMALIER_CHECK_HOM_ALTS" ) != " "
883892 final.remove_spurious_parent_ids (L, stats)
884893 for i, sample_name in L.sample_names:
885- add_parents_and_check_sex (final, stats, gt_counts, i, L)
886- add_siblings (final, stats, gt_counts, L)
894+ add_parents_and_check_sex (final, stats, gt_counts, i, L, check_hom_alts )
895+ add_siblings (final, stats, gt_counts, L, check_hom_alts )
887896
888897 add_parent_to_sibs (final, stats, gt_counts, L)
889898
890899 for i, sample_name in L.sample_names:
891- update_family_ids (final, stats, gt_counts, i, L)
900+ update_family_ids (final, stats, gt_counts, i, L, check_hom_alts )
892901 final.remove_spurious_parent_ids (L, stats)
893902
894903 for i, sample_name in L.sample_names:
@@ -1037,6 +1046,8 @@ proc rel_main*() =
10371046
10381047 t0 = cpuTime ()
10391048
1049+ let check_hom_alts = getEnv (" SOMALIER_CHECK_HOM_ALTS" ) != " "
1050+
10401051 var tmpls = tmpl_html.split (" <INPUT_JSON>" )
10411052 fh_html.write (tmpls[0 ].replace (" <SAMPLE_JSON>" , toj (final.samples,
10421053 final.stats, final.gt_counts, samples.to_sex_lookup)))
@@ -1092,8 +1103,8 @@ proc rel_main*() =
10921103 stderr.write_line & " [somalier] apparent identical twins or sample duplicate found with { rel.sample_a} and { rel.sample_b} assuming siblings as these were specified as such in the pedigree file "
10931104 sib_pairs.mgetOrPut (rel.sample_a, @ []).add (rel.sample_b)
10941105 sib_pairs.mgetOrPut (rel.sample_b, @ []).add (rel.sample_a)
1095- elif rr > 0.2 and final.gt_counts.high_quality (T.i) and
1096- final.gt_counts.high_quality (T.j):
1106+ elif rr > 0.2 and final.gt_counts.high_quality (T.i, check_hom_alts ) and
1107+ final.gt_counts.high_quality (T.j, check_hom_alts ):
10971108 relGt0p2.mgetOrPut (rel.sample_a, @ []).add (rel.sample_b)
10981109 relGt0p2.mgetOrPut (rel.sample_b, @ []).add (rel.sample_a)
10991110
0 commit comments