Skip to content

Commit deea5d5

Browse files
committed
Rule based classification
1 parent 4d5f1f5 commit deea5d5

File tree

1 file changed

+44
-0
lines changed

1 file changed

+44
-0
lines changed

text_analysis.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,47 @@ def is_meaningful_content(text: str):
5959
if text.count("\x0c") / len(text) > PAGE_MARGER_PERCENTAGE_THRESHOLD:
6060
return False
6161
return True
62+
63+
64+
def _count_keywords(haystack, keywords):
    """Return how many entries of *keywords* occur as substrings of *haystack*."""
    return sum(1 for keyword in keywords if keyword in haystack)


def classify(text: str):
    """
    Classify OCR-extracted text as one of the supported identity documents.

    Currently supported categories are:
    - PAN Card
    - Aadhaar Card
    - Passport

    The checks run in the order passport, PAN, Aadhaar and the first match
    wins. Matching is plain case-insensitive substring search against a
    keyword list per document type; the plan is to move to an NLP Naive Bayes
    classifier and later to more advanced classification models.

    :param text: raw text of the document (typically OCR output).
    :return: ``"passport"``, ``"pan"`` or ``"aadhaar"`` when the text matches
        that document type, otherwise ``None``.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import block.
    import re

    lowered_text = text.lower()

    # NOTE: matching is by substring, so overlapping keywords are counted
    # together ('indian' also counts 'india', 'surname' also counts 'name',
    # 'passport no' also counts 'passport'). The threshold below was chosen
    # with that behaviour in place.
    passport_needed_words = ['india', 'indian', 'surname', 'nationality', 'given', 'name', 'passport', 'date of birth', 'place of birth', 'place of issue', 'date of issue', 'passport no']
    if _count_keywords(lowered_text, passport_needed_words) >= 6:
        # TODO: Also ensure the passport number regex is found.
        return "passport"

    # TODO: Make keyword matching lenient. OCR makes mistakes such as reading
    # 'indome' instead of 'income', and accommodation for that must be made.
    pan_needed_words = ['income', 'tax', 'department', 'govt', 'india']
    if _count_keywords(lowered_text, pan_needed_words) >= 3:
        # TODO: Also make sure that a regex of form 'AZMPR1111L' is found.
        # This text is dark and bold and OCR would have definitely picked it up.
        return "pan"

    # In an Aadhaar card, "Government of India" is shaded, hence binarization
    # causes it not to be read properly; only weak keywords are available.
    # TODO: Make keyword matching lenient ('isdue' instead of 'issue').
    aadhaar_needed_words = ['issue', 'date']
    if _count_keywords(lowered_text, aadhaar_needed_words) >= 1:
        # The keywords alone are far too generic (any text mentioning a date
        # would match), so the Aadhaar number of form '1234 1234 1234' is
        # mandatory. It is printed dark and bold, so OCR is expected to have
        # picked it up. The look-around assertions stop the pattern from
        # matching inside a longer run of digits.
        aadhaar_number = r"(?<!\d)\d{4}[ \t]+\d{4}[ \t]+\d{4}(?!\d)"
        if re.search(aadhaar_number, text):
            return "aadhaar"

    return None

0 commit comments

Comments
 (0)