22
33import os
44import re
5- import sys
65import csv
76import cyvcf2
87import gzip
1110from logging import Logger
1211
1312from pcgr import utils , pcgr_vars
14- from pcgr .utils import check_subprocess , error_message
13+ from pcgr .utils import error_message
1514from pcgr .variant import reverse_complement_dna
1615
1716csv .field_size_limit (500 * 1024 * 1024 )
@@ -37,7 +36,7 @@ def read_infotag_file(vcf_info_tags_tsv, scope = "vep"):
3736 tsvfile = open (vcf_info_tags_tsv , 'r' )
3837 reader = csv .DictReader (tsvfile , delimiter = '\t ' )
3938 for row in reader :
40- if not row ['tag' ] in info_tag_xref :
39+ if row ['tag' ] not in info_tag_xref :
4140 if scope in row ['category' ]:
4241 #print(scope + '\t' + str(row['category']) + '\t' + str(row['tag']))
4342 info_tag_xref [row ['tag' ]] = row
@@ -70,7 +69,7 @@ def read_vcfanno_tag_file(vcfanno_tag_file, logger):
7069 if elem .startswith ('Description' ):
7170 row ['description' ] = re .sub (r'">|Description="' , '' , elem )
7271
73- if 'tag' in row and not row ['tag' ] in infotag_results :
72+ if 'tag' in row and row ['tag' ] not in infotag_results :
7473 infotag_results [row ['tag' ]] = row
7574
7675 return (infotag_results )
@@ -147,7 +146,7 @@ def map_regulatory_variant_annotations(vep_csq_records):
147146 while j < len (vep_csq_records ):
148147 missing_key_annotation = False
149148 for k in ['Feature_type' , 'Consequence' , 'Feature' ]:
150- if not k in vep_csq_records [j ].keys ():
149+ if k not in vep_csq_records [j ].keys ():
151150 missing_key_annotation = True
152151
153152 if missing_key_annotation is False :
@@ -170,7 +169,7 @@ def map_regulatory_variant_annotations(vep_csq_records):
170169 missing_motif_annotation = False
171170 for annotation in ['MOTIF_NAME' , 'MOTIF_POS' , 'HIGH_INF_POS' ,
172171 'MOTIF_SCORE_CHANGE' , 'TRANSCRIPTION_FACTORS' ]:
173- if not annotation in vep_csq_records [j ].keys ():
172+ if annotation not in vep_csq_records [j ].keys ():
174173 missing_motif_annotation = True
175174
176175 if missing_motif_annotation is False :
@@ -263,20 +262,15 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
263262 if re .search (pcgr_vars .CSQ_NULL_PATTERN , str (csq_record ['Consequence' ])) is not None :
264263 csq_record ['NULL_VARIANT' ] = True
265264
266- if not csq_record ['MaxEntScan_diff' ] is None and not csq_record ['MaxEntScan_ref' ] is None and not csq_record ['MaxEntScan_alt' ] is None :
267- fraction_drop = 0.0
268- #if float(csq_record['MaxEntScan_ref']) > 0:
269- # fraction_drop = float(csq_record['MaxEntScan_diff']) / float(csq_record['MaxEntScan_ref']).round(4)
270- #else:
271- # fraction_drop = 0.0
265+ if csq_record ['MaxEntScan_diff' ] is not None and csq_record ['MaxEntScan_ref' ] is not None and csq_record ['MaxEntScan_alt' ] is not None :
272266 csq_record ['MAXENTSCAN' ] = 'MES|' + str (csq_record ['MaxEntScan_diff' ]) + '|' + \
273267 str (csq_record ['MaxEntScan_ref' ]) + '|' + str (csq_record ['MaxEntScan_alt' ]) #+ \
274268 #'|' + str(fraction_drop)
275269
276270 if re .search (pcgr_vars .CSQ_SPLICE_DONOR_PATTERN , str (csq_record ['Consequence' ])) is not None :
277271 if re .search (r'(\+3(A|G)>|\+4A>|\+5G>)' , str (csq_record ['HGVSc' ])) is not None :
278272 csq_record ['SPLICE_DONOR_RELEVANT' ] = True
279- if not csq_record ['MaxEntScan_diff' ] is None and re .search ('splice_donor_(5th|variant)' ,str (csq_record ['Consequence' ])) is not None :
273+ if csq_record ['MaxEntScan_diff' ] is not None and re .search ('splice_donor_(5th|variant)' ,str (csq_record ['Consequence' ])) is not None :
280274 if abs (csq_record ['MaxEntScan_diff' ]) >= pcgr_vars .DONOR_DISRUPTION_MES_CUTOFF :
281275 csq_record ['LOSS_OF_FUNCTION' ] = True
282276 csq_record ['MAXENTSCAN' ] = str (csq_record ['MAXENTSCAN' ]) + '|DONOR_DISRUPTING'
@@ -287,7 +281,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
287281 csq_record ['MAXENTSCAN' ] = str (csq_record ['MAXENTSCAN' ]) + '|NON_DONOR_DISRUPTING'
288282
289283 if re .search (pcgr_vars .CSQ_SPLICE_ACCEPTOR_PATTERN , str (csq_record ['Consequence' ])) is not None :
290- if not csq_record ['MaxEntScan_diff' ] is None and re .search ('splice_acceptor' , str (csq_record ['Consequence' ])) is not None :
284+ if csq_record ['MaxEntScan_diff' ] is not None and re .search ('splice_acceptor' , str (csq_record ['Consequence' ])) is not None :
291285 if abs (csq_record ['MaxEntScan_diff' ]) >= pcgr_vars .ACCEPTOR_DISRUPTION_MES_CUTOFF :
292286 csq_record ['LOSS_OF_FUNCTION' ] = True
293287 csq_record ['MAXENTSCAN' ] = str (csq_record ['MAXENTSCAN' ]) + '|ACCEPTOR_DISRUPTING'
@@ -307,7 +301,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
307301 csq_record ['INTRON_POSITION' ] = int (pos )
308302
309303 if 'NearestExonJB' in csq_record .keys ():
310- if not csq_record ['NearestExonJB' ] is None :
304+ if csq_record ['NearestExonJB' ] is not None :
311305 if re .search (r"synonymous_|missense_|stop_|frameshift|inframe_|start_" , str (csq_record ['Consequence' ])) is not None and \
312306 str (csq_record ['NearestExonJB' ]) != "" :
313307 exon_pos_info = csq_record ['NearestExonJB' ].split ("+" )
@@ -319,7 +313,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
319313
320314 ## filter putative LOF variants if they occur too close to the CDS end (less than 5% of the CDS length remains after the variant)
321315 if 'CDS_position' in csq_record .keys ():
322- if not csq_record ['CDS_position' ] is None and csq_record ['LOSS_OF_FUNCTION' ] is True and splice_variant is False :
316+ if csq_record ['CDS_position' ] is not None and csq_record ['LOSS_OF_FUNCTION' ] is True and splice_variant is False :
323317 if csq_record ['CDS_position' ] != '.' :
324318 if '/' in csq_record ['CDS_position' ]:
325319 cds_length = str (csq_record ['CDS_position' ]).split ('/' )[1 ]
@@ -332,7 +326,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
332326 cds_pos_full = str (csq_record ['CDS_position' ]).split ('/' )[0 ]
333327
334328 ## Frameshift variants are listed with a range (separated by '-'), choose start position
335- if '-' in cds_pos_full and not '?' in cds_pos_full :
329+ if '-' in cds_pos_full and '?' not in cds_pos_full :
336330 cds_pos = cds_pos_full .split ('-' )[0 ]
337331 if cds_pos .isdigit ():
338332 cds_pos = int (cds_pos )
@@ -350,15 +344,15 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
350344 csq_record ['LOF_FILTER' ] = "END_TRUNCATION"
351345
352346 if 'HGVSc' in csq_record .keys () and 'Consequence' in csq_record .keys ():
353- if not csq_record ['HGVSc' ] is None :
347+ if csq_record ['HGVSc' ] is not None :
354348 if csq_record ['HGVSc' ] != '.' :
355349
356350 if len (str (csq_record ['HGVSc' ]).split (':' )) == 2 :
357351 csq_record ['ALTERATION' ] = str (csq_record ['HGVSc' ].split (':' )[1 ])
358352
359353 ## Use RefSeq transcript ID (MANE SELECT) for HGVSc if available
360354 csq_record ['HGVSc_RefSeq' ] = '.:.'
361- if not csq_record ['MANE_SELECT' ] is None :
355+ if csq_record ['MANE_SELECT' ] is not None :
362356 if ":" in csq_record ['HGVSc' ]:
363357 hgvsc_data = csq_record ['HGVSc' ].split (':' )
364358 if len (hgvsc_data ) == 2 :
@@ -367,22 +361,22 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
367361 ## GRCh37 - MANE_SELECT not provided by VEP for GRCh37, so use MANE_SELECT2 (customly provided through geneOncoX)
368362 else :
369363 if 'MANE_SELECT2' in csq_record .keys ():
370- if not csq_record ['MANE_SELECT2' ] is None :
364+ if csq_record ['MANE_SELECT2' ] is not None :
371365 if ":" in csq_record ['HGVSc' ]:
372366 hgvsc_data = csq_record ['HGVSc' ].split (':' )
373367 if len (hgvsc_data ) == 2 :
374368 csq_record ['HGVSc_RefSeq' ] = str (csq_record ['MANE_SELECT2' ]) + ':' + str (hgvsc_data [1 ])
375369 else :
376370 if 'REFSEQ_SELECT' in csq_record .keys ():
377- if not csq_record ['REFSEQ_SELECT' ] is None :
371+ if csq_record ['REFSEQ_SELECT' ] is not None :
378372 csq_record ['REFSEQ_SELECT' ] = str (csq_record ['REFSEQ_SELECT' ].split ('&' )[0 ])
379373 if ":" in csq_record ['HGVSc' ]:
380374 hgvsc_data = csq_record ['HGVSc' ].split (':' )
381375 if len (hgvsc_data ) == 2 :
382376 csq_record ['HGVSc_RefSeq' ] = str (csq_record ['REFSEQ_SELECT' ]) + ':' + str (hgvsc_data [1 ])
383377 else :
384378 if 'REFSEQ_TRANSCRIPT_ID' in csq_record .keys ():
385- if not csq_record ['REFSEQ_TRANSCRIPT_ID' ] is None :
379+ if csq_record ['REFSEQ_TRANSCRIPT_ID' ] is not None :
386380 csq_record ['REFSEQ_TRANSCRIPT_ID' ] = str (csq_record ['REFSEQ_TRANSCRIPT_ID' ].split ('&' )[0 ])
387381 if ":" in csq_record ['HGVSc' ]:
388382 hgvsc_data = csq_record ['HGVSc' ].split (':' )
@@ -404,7 +398,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
404398 protein_change = '.'
405399
406400 if 'Protein_position' in csq_record .keys ():
407- if not csq_record ['Protein_position' ] is None :
401+ if csq_record ['Protein_position' ] is not None :
408402 if not csq_record ['Protein_position' ].startswith ('-' ) and csq_record ['Protein_position' ] != '.' :
409403 if '/' in csq_record ['Protein_position' ]:
410404 protein_position = str (csq_record ['Protein_position' ].split ('/' )[0 ])
@@ -419,7 +413,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
419413 csq_record ['AMINO_ACID_END' ] = protein_position
420414
421415 if 'synonymous_variant' in csq_record ['Consequence' ] and 'Amino_acids' in csq_record .keys ():
422- if not csq_record ['Amino_acids' ] is None :
416+ if csq_record ['Amino_acids' ] is not None :
423417 protein_change = 'p.' + \
424418 str (csq_record ['Amino_acids' ]) + \
425419 str (protein_position ) + str (csq_record ['Amino_acids' ])
@@ -429,7 +423,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
429423 str (protein_position ) + \
430424 str (csq_record ['Amino_acids' ]).split ('/' )[1 ]
431425
432- if not csq_record ['HGVSp' ] is None :
426+ if csq_record ['HGVSp' ] is not None :
433427 if csq_record ['HGVSp' ] != '.' :
434428 if ':' in csq_record ['HGVSp' ]:
435429 protein_identifier = str (csq_record ['HGVSp' ].split (':' )[0 ])
@@ -439,7 +433,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
439433
440434 if 'missense_variant' in csq_record ['Consequence' ] and not re .search (r'(del|ins|fs|X)' , protein_change ):
441435 if 'Amino_acids' in csq_record .keys ():
442- if not csq_record ['Amino_acids' ] is None :
436+ if csq_record ['Amino_acids' ] is not None :
443437 if '/' in str (csq_record ['Amino_acids' ]):
444438 aaref = str (csq_record ['Amino_acids' ]).split ('/' )[0 ]
445439 aalt = str (csq_record ['Amino_acids' ]).split ('/' )[1 ]
@@ -462,20 +456,20 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
462456 'ENSEMBL_TRANSCRIPT_ID' in csq_record .keys () and \
463457 'MANE_SELECT' in csq_record .keys ():
464458 varkey_info = str (csq_record ['VARKEY' ]).split ('_' )
465- if not csq_record ['CDS_START' ] is None :
459+ if csq_record ['CDS_START' ] is not None :
466460 if len (varkey_info ) == 4 :
467461 csq_record ['CDS_DISTANCE' ] = abs (int (float (csq_record ['CDS_START' ])) - int (csq_record ['VARKEY' ].split ('_' )[1 ]))
468462 cds_dna_alteration = str (varkey_info [2 ]) + '>' + str (varkey_info [3 ])
469463 if csq_record ['STRAND' ] == '-1' :
470464 cds_dna_alteration = reverse_complement_dna (str (varkey_info [2 ])) + '>' + reverse_complement_dna (str (varkey_info [3 ]))
471465 csq_record ['ALTERATION' ] = str ('c.-' + str (csq_record ['CDS_DISTANCE' ]) + str (cds_dna_alteration ))
472466 csq_record ['HGVSc' ] = csq_record ['ENSEMBL_TRANSCRIPT_ID' ] + ':' + csq_record ['ALTERATION' ]
473- if not csq_record ['MANE_SELECT' ] is None :
467+ if csq_record ['MANE_SELECT' ] is not None :
474468 csq_record ['HGVSc_RefSeq' ] = str (csq_record ['MANE_SELECT' ]) + ':' + str (csq_record ['ALTERATION' ])
475469
476470 csq_record ['HGVSp_short' ] = protein_change
477471 exon_number = 'NA'
478- if not csq_record ['EXON' ] is None :
472+ if csq_record ['EXON' ] is not None :
479473 if csq_record ['EXON' ] != '.' :
480474 if '/' in csq_record ['EXON' ]:
481475 exon_number = str (csq_record ['EXON' ]).split ('/' )[0 ]
@@ -487,15 +481,15 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
487481 csq_record ['LAST_EXON' ] = True
488482
489483 if 'INTRON' in csq_record .keys ():
490- if not csq_record ['INTRON' ] is None :
484+ if csq_record ['INTRON' ] is not None :
491485 if csq_record ['INTRON' ] != '.' :
492486 if '/' in csq_record ['INTRON' ]:
493487 intron_number = str (csq_record ['INTRON' ]).split ('/' )[0 ]
494488 num_introns = str (csq_record ['INTRON' ]).split ('/' )[1 ]
495489 if intron_number == num_introns :
496490 csq_record ['LAST_INTRON' ] = True
497491
498- if not csq_record ['HGVSc' ] is None :
492+ if csq_record ['HGVSc' ] is not None :
499493 if csq_record ['HGVSc' ] != '.' :
500494 if protein_change != '.' :
501495 key = str (csq_record ['Consequence' ]) + ':' + \
@@ -510,7 +504,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
510504
511505def make_transcript_xref_map (rec , fieldmap , xref_tag = 'GENE_TRANSCRIPT_XREF' ):
512506 transcript_xref_map = {}
513- if not rec .INFO .get (xref_tag ) is None :
507+ if rec .INFO .get (xref_tag ) is not None :
514508 for tref in rec .INFO .get (xref_tag ).split (',' ):
515509 xrefs = tref .split ('|' )
516510 ensembl_transcript_id = str (xrefs [0 ])
0 commit comments