@@ -59,3 +59,47 @@ def is_meaningful_content(text: str):
59
59
if text .count ("\x0c " ) / len (text ) > PAGE_MARGER_PERCENTAGE_THRESHOLD :
60
60
return False
61
61
return True
62
+
63
+
64
def classify(text: str):
    """
    Guess which kind of identity document an OCR'd text came from.

    Recognised labels, tried in this order (first match wins):
        - "passport"
        - "pan"      (PAN Card)
        - "aadhaar"  (Aadhaar Card)

    Matching is a plain case-insensitive substring count: for each document
    type we count how many of its keywords occur in the text and accept the
    type once the count reaches that type's threshold. The plan is to replace
    this with a Naive Bayes classifier, and later with more advanced models.

    :param text: Raw text extracted from the document.
    :return: One of the labels above, or None when no rule is satisfied.
    """
    haystack = text.lower()

    # Each rule is (label, keywords, minimum number of keywords that must
    # appear). The order of the rules is significant.
    rules = (
        # TODO: also require a passport-number regex match.
        (
            "passport",
            (
                'india', 'indian', 'surname', 'nationality', 'given', 'name',
                'passport', 'date of birth', 'place of birth',
                'place of issue', 'date of issue', 'passport no',
            ),
            6,
        ),
        # TODO: also require a regex of the form 'AZMPR1111L'; that text is
        # dark and bold, so OCR should reliably pick it up.
        # TODO: match leniently -- OCR may read 'indome' instead of 'income'.
        (
            "pan",
            ('income', 'tax', 'department', 'govt', 'india'),
            3,
        ),
        # On an Aadhaar card "Government of India" is shaded, so binarization
        # tends to make it unreadable; hence the very small keyword set.
        # TODO: require the 12-digit number pattern '1234 1234 1234'; it is
        # dark and bold, so OCR should reliably pick it up.
        # TODO: match leniently -- OCR may read 'isdue' instead of 'issue'.
        (
            "aadhaar",
            ('issue', 'date'),
            1,
        ),
    )

    for label, keywords, minimum_hits in rules:
        hits = sum(1 for keyword in keywords if keyword in haystack)
        if hits >= minimum_hits:
            return label
    return None
0 commit comments