Skip to content

Commit b573a8e

Browse files
authored
Merge pull request #280 from sigven/sep2025_cna_patch
Sep2025 cna patch
2 parents bcae4e9 + 2757b6f commit b573a8e

30 files changed

+303
-242
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](https://c
2828

2929
### Top News
3030

31+
- *September 8th 2025:* **2.2.4 release**
32+
- various minor bug fixes, addition of `--sex` option for sex-adjusted CNA annotation
33+
- [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html)
34+
3135
- *July 18th 2025:* **2.2.3 release**
3236
- Ensembl VEP `v113.4`
3337

pcgr/annoutils.py

Lines changed: 26 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import os
44
import re
5-
import sys
65
import csv
76
import cyvcf2
87
import gzip
@@ -11,7 +10,7 @@
1110
from logging import Logger
1211

1312
from pcgr import utils, pcgr_vars
14-
from pcgr.utils import check_subprocess, error_message
13+
from pcgr.utils import error_message
1514
from pcgr.variant import reverse_complement_dna
1615

1716
csv.field_size_limit(500 * 1024 * 1024)
@@ -37,7 +36,7 @@ def read_infotag_file(vcf_info_tags_tsv, scope = "vep"):
3736
tsvfile = open(vcf_info_tags_tsv, 'r')
3837
reader = csv.DictReader(tsvfile, delimiter='\t')
3938
for row in reader:
40-
if not row['tag'] in info_tag_xref:
39+
if row['tag'] not in info_tag_xref:
4140
if scope in row['category']:
4241
#print(scope + '\t' + str(row['category']) + '\t' + str(row['tag']))
4342
info_tag_xref[row['tag']] = row
@@ -70,7 +69,7 @@ def read_vcfanno_tag_file(vcfanno_tag_file, logger):
7069
if elem.startswith('Description'):
7170
row['description'] = re.sub(r'">|Description="', '', elem)
7271

73-
if 'tag' in row and not row['tag'] in infotag_results:
72+
if 'tag' in row and row['tag'] not in infotag_results:
7473
infotag_results[row['tag']] = row
7574

7675
return(infotag_results)
@@ -147,7 +146,7 @@ def map_regulatory_variant_annotations(vep_csq_records):
147146
while j < len(vep_csq_records):
148147
missing_key_annotation = False
149148
for k in ['Feature_type', 'Consequence', 'Feature']:
150-
if not k in vep_csq_records[j].keys():
149+
if k not in vep_csq_records[j].keys():
151150
missing_key_annotation = True
152151

153152
if missing_key_annotation is False:
@@ -170,7 +169,7 @@ def map_regulatory_variant_annotations(vep_csq_records):
170169
missing_motif_annotation = False
171170
for annotation in ['MOTIF_NAME', 'MOTIF_POS', 'HIGH_INF_POS',
172171
'MOTIF_SCORE_CHANGE', 'TRANSCRIPTION_FACTORS']:
173-
if not annotation in vep_csq_records[j].keys():
172+
if annotation not in vep_csq_records[j].keys():
174173
missing_motif_annotation = True
175174

176175
if missing_motif_annotation is False:
@@ -263,20 +262,15 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
263262
if re.search(pcgr_vars.CSQ_NULL_PATTERN, str(csq_record['Consequence'])) is not None:
264263
csq_record['NULL_VARIANT'] = True
265264

266-
if not csq_record['MaxEntScan_diff'] is None and not csq_record['MaxEntScan_ref'] is None and not csq_record['MaxEntScan_alt'] is None:
267-
fraction_drop = 0.0
268-
#if float(csq_record['MaxEntScan_ref']) > 0:
269-
# fraction_drop = float(csq_record['MaxEntScan_diff']) / float(csq_record['MaxEntScan_ref']).round(4)
270-
#else:
271-
# fraction_drop = 0.0
265+
if csq_record['MaxEntScan_diff'] is not None and csq_record['MaxEntScan_ref'] is not None and csq_record['MaxEntScan_alt'] is not None:
272266
csq_record['MAXENTSCAN'] = 'MES|' + str(csq_record['MaxEntScan_diff']) + '|' + \
273267
str(csq_record['MaxEntScan_ref']) + '|' + str(csq_record['MaxEntScan_alt']) #+ \
274268
#'|' + str(fraction_drop)
275269

276270
if re.search(pcgr_vars.CSQ_SPLICE_DONOR_PATTERN, str(csq_record['Consequence'])) is not None:
277271
if re.search(r'(\+3(A|G)>|\+4A>|\+5G>)', str(csq_record['HGVSc'])) is not None:
278272
csq_record['SPLICE_DONOR_RELEVANT'] = True
279-
if not csq_record['MaxEntScan_diff'] is None and re.search('splice_donor_(5th|variant)',str(csq_record['Consequence'])) is not None:
273+
if csq_record['MaxEntScan_diff'] is not None and re.search('splice_donor_(5th|variant)',str(csq_record['Consequence'])) is not None:
280274
if abs(csq_record['MaxEntScan_diff']) >= pcgr_vars.DONOR_DISRUPTION_MES_CUTOFF:
281275
csq_record['LOSS_OF_FUNCTION'] = True
282276
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|DONOR_DISRUPTING'
@@ -287,7 +281,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
287281
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|NON_DONOR_DISRUPTING'
288282

289283
if re.search(pcgr_vars.CSQ_SPLICE_ACCEPTOR_PATTERN, str(csq_record['Consequence'])) is not None:
290-
if not csq_record['MaxEntScan_diff'] is None and re.search('splice_acceptor', str(csq_record['Consequence'])) is not None:
284+
if csq_record['MaxEntScan_diff'] is not None and re.search('splice_acceptor', str(csq_record['Consequence'])) is not None:
291285
if abs(csq_record['MaxEntScan_diff']) >= pcgr_vars.ACCEPTOR_DISRUPTION_MES_CUTOFF:
292286
csq_record['LOSS_OF_FUNCTION'] = True
293287
csq_record['MAXENTSCAN'] = str(csq_record['MAXENTSCAN']) + '|ACCEPTOR_DISRUPTING'
@@ -307,7 +301,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
307301
csq_record['INTRON_POSITION'] = int(pos)
308302

309303
if 'NearestExonJB' in csq_record.keys():
310-
if not csq_record['NearestExonJB'] is None:
304+
if csq_record['NearestExonJB'] is not None:
311305
if re.search(r"synonymous_|missense_|stop_|frameshift|inframe_|start_", str(csq_record['Consequence'])) is not None and \
312306
str(csq_record['NearestExonJB']) != "":
313307
exon_pos_info = csq_record['NearestExonJB'].split("+")
@@ -319,7 +313,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
319313

320314
## filter putative LOF variants if they occur too close to the CDS end (less than 5% of the CDS length remains after the variant)
321315
if 'CDS_position' in csq_record.keys():
322-
if not csq_record['CDS_position'] is None and csq_record['LOSS_OF_FUNCTION'] is True and splice_variant is False:
316+
if csq_record['CDS_position'] is not None and csq_record['LOSS_OF_FUNCTION'] is True and splice_variant is False:
323317
if csq_record['CDS_position'] != '.':
324318
if '/' in csq_record['CDS_position']:
325319
cds_length = str(csq_record['CDS_position']).split('/')[1]
@@ -332,7 +326,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
332326
cds_pos_full = str(csq_record['CDS_position']).split('/')[0]
333327

334328
## Frameshift variants are listed with a range (separated by '-'), choose start position
335-
if '-' in cds_pos_full and not '?' in cds_pos_full:
329+
if '-' in cds_pos_full and '?' not in cds_pos_full:
336330
cds_pos = cds_pos_full.split('-')[0]
337331
if cds_pos.isdigit():
338332
cds_pos = int(cds_pos)
@@ -350,15 +344,15 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
350344
csq_record['LOF_FILTER'] = "END_TRUNCATION"
351345

352346
if 'HGVSc' in csq_record.keys() and 'Consequence' in csq_record.keys():
353-
if not csq_record['HGVSc'] is None:
347+
if csq_record['HGVSc'] is not None:
354348
if csq_record['HGVSc'] != '.':
355349

356350
if len(str(csq_record['HGVSc']).split(':')) == 2:
357351
csq_record['ALTERATION'] = str(csq_record['HGVSc'].split(':')[1])
358352

359353
## Use RefSeq transcript ID (MANE SELECT) for HGVSc if available
360354
csq_record['HGVSc_RefSeq'] = '.:.'
361-
if not csq_record['MANE_SELECT'] is None:
355+
if csq_record['MANE_SELECT'] is not None:
362356
if ":" in csq_record['HGVSc']:
363357
hgvsc_data = csq_record['HGVSc'].split(':')
364358
if len(hgvsc_data) == 2:
@@ -367,22 +361,22 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
367361
## GRCh37 - MANE_SELECT not provided by VEP for GRCh37, so use MANE_SELECT2 (customly provided through geneOncoX)
368362
else:
369363
if 'MANE_SELECT2' in csq_record.keys():
370-
if not csq_record['MANE_SELECT2'] is None:
364+
if csq_record['MANE_SELECT2'] is not None:
371365
if ":" in csq_record['HGVSc']:
372366
hgvsc_data = csq_record['HGVSc'].split(':')
373367
if len(hgvsc_data) == 2:
374368
csq_record['HGVSc_RefSeq'] = str(csq_record['MANE_SELECT2']) + ':' + str(hgvsc_data[1])
375369
else:
376370
if 'REFSEQ_SELECT' in csq_record.keys():
377-
if not csq_record['REFSEQ_SELECT'] is None:
371+
if csq_record['REFSEQ_SELECT'] is not None:
378372
csq_record['REFSEQ_SELECT'] = str(csq_record['REFSEQ_SELECT'].split('&')[0])
379373
if ":" in csq_record['HGVSc']:
380374
hgvsc_data = csq_record['HGVSc'].split(':')
381375
if len(hgvsc_data) == 2:
382376
csq_record['HGVSc_RefSeq'] = str(csq_record['REFSEQ_SELECT']) + ':' + str(hgvsc_data[1])
383377
else:
384378
if 'REFSEQ_TRANSCRIPT_ID' in csq_record.keys():
385-
if not csq_record['REFSEQ_TRANSCRIPT_ID'] is None:
379+
if csq_record['REFSEQ_TRANSCRIPT_ID'] is not None:
386380
csq_record['REFSEQ_TRANSCRIPT_ID'] = str(csq_record['REFSEQ_TRANSCRIPT_ID'].split('&')[0])
387381
if ":" in csq_record['HGVSc']:
388382
hgvsc_data = csq_record['HGVSc'].split(':')
@@ -404,7 +398,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
404398
protein_change = '.'
405399

406400
if 'Protein_position' in csq_record.keys():
407-
if not csq_record['Protein_position'] is None:
401+
if csq_record['Protein_position'] is not None:
408402
if not csq_record['Protein_position'].startswith('-') and csq_record['Protein_position'] != '.':
409403
if '/' in csq_record['Protein_position']:
410404
protein_position = str(csq_record['Protein_position'].split('/')[0])
@@ -419,7 +413,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
419413
csq_record['AMINO_ACID_END'] = protein_position
420414

421415
if 'synonymous_variant' in csq_record['Consequence'] and 'Amino_acids' in csq_record.keys():
422-
if not csq_record['Amino_acids'] is None:
416+
if csq_record['Amino_acids'] is not None:
423417
protein_change = 'p.' + \
424418
str(csq_record['Amino_acids']) + \
425419
str(protein_position) + str(csq_record['Amino_acids'])
@@ -429,7 +423,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
429423
str(protein_position) + \
430424
str(csq_record['Amino_acids']).split('/')[1]
431425

432-
if not csq_record['HGVSp'] is None:
426+
if csq_record['HGVSp'] is not None:
433427
if csq_record['HGVSp'] != '.':
434428
if ':' in csq_record['HGVSp']:
435429
protein_identifier = str(csq_record['HGVSp'].split(':')[0])
@@ -439,7 +433,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
439433

440434
if 'missense_variant' in csq_record['Consequence'] and not re.search(r'(del|ins|fs|X)', protein_change):
441435
if 'Amino_acids' in csq_record.keys():
442-
if not csq_record['Amino_acids'] is None:
436+
if csq_record['Amino_acids'] is not None:
443437
if '/' in str(csq_record['Amino_acids']):
444438
aaref = str(csq_record['Amino_acids']).split('/')[0]
445439
aalt = str(csq_record['Amino_acids']).split('/')[1]
@@ -462,20 +456,20 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
462456
'ENSEMBL_TRANSCRIPT_ID' in csq_record.keys() and \
463457
'MANE_SELECT' in csq_record.keys():
464458
varkey_info = str(csq_record['VARKEY']).split('_')
465-
if not csq_record['CDS_START'] is None:
459+
if csq_record['CDS_START'] is not None:
466460
if len(varkey_info) == 4:
467461
csq_record['CDS_DISTANCE'] = abs(int(float(csq_record['CDS_START'])) - int(csq_record['VARKEY'].split('_')[1]))
468462
cds_dna_alteration = str(varkey_info[2]) + '>' + str(varkey_info[3])
469463
if csq_record['STRAND'] == '-1':
470464
cds_dna_alteration = reverse_complement_dna(str(varkey_info[2])) + '>' + reverse_complement_dna(str(varkey_info[3]))
471465
csq_record['ALTERATION'] = str('c.-' + str(csq_record['CDS_DISTANCE']) + str(cds_dna_alteration))
472466
csq_record['HGVSc'] = csq_record['ENSEMBL_TRANSCRIPT_ID'] + ':' + csq_record['ALTERATION']
473-
if not csq_record['MANE_SELECT'] is None:
467+
if csq_record['MANE_SELECT'] is not None:
474468
csq_record['HGVSc_RefSeq'] = str(csq_record['MANE_SELECT']) + ':' + str(csq_record['ALTERATION'])
475469

476470
csq_record['HGVSp_short'] = protein_change
477471
exon_number = 'NA'
478-
if not csq_record['EXON'] is None:
472+
if csq_record['EXON'] is not None:
479473
if csq_record['EXON'] != '.':
480474
if '/' in csq_record['EXON']:
481475
exon_number = str(csq_record['EXON']).split('/')[0]
@@ -487,15 +481,15 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
487481
csq_record['LAST_EXON'] = True
488482

489483
if 'INTRON' in csq_record.keys():
490-
if not csq_record['INTRON'] is None:
484+
if csq_record['INTRON'] is not None:
491485
if csq_record['INTRON'] != '.':
492486
if '/' in csq_record['INTRON']:
493487
intron_number = str(csq_record['INTRON']).split('/')[0]
494488
num_introns = str(csq_record['INTRON']).split('/')[1]
495489
if intron_number == num_introns:
496490
csq_record['LAST_INTRON'] = True
497491

498-
if not csq_record['HGVSc'] is None:
492+
if csq_record['HGVSc'] is not None:
499493
if csq_record['HGVSc'] != '.':
500494
if protein_change != '.':
501495
key = str(csq_record['Consequence']) + ':' + \
@@ -510,7 +504,7 @@ def assign_cds_exon_intron_annotations(csq_record, grantham_scores, logger):
510504

511505
def make_transcript_xref_map(rec, fieldmap, xref_tag='GENE_TRANSCRIPT_XREF'):
512506
transcript_xref_map = {}
513-
if not rec.INFO.get(xref_tag) is None:
507+
if rec.INFO.get(xref_tag) is not None:
514508
for tref in rec.INFO.get(xref_tag).split(','):
515509
xrefs = tref.split('|')
516510
ensembl_transcript_id = str(xrefs[0])

0 commit comments

Comments
 (0)