Skip to content

Commit 76369e0

Browse files
committed
Added basic conversation ability using spaCy
1 parent d865ca9 commit 76369e0

File tree

2 files changed

+149
-0
lines changed

2 files changed

+149
-0
lines changed

language_processing.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""
2+
Responsible for doing Natural Language Processing.
3+
It should ideally operate on the extracted text.
4+
5+
It should have ability to perform things like:
6+
- Parts of Speech tagging.
7+
- Named Entity Recognition
8+
- Finding digits, words etc.
9+
- Stopwords removal
10+
- Compute Lexical Diversity
11+
12+
Later, we want it to perform:
13+
- Summarization
14+
- Answer basic question
15+
"""
16+
17+
import logging
18+
import spacy
19+
20+
logger = logging.getLogger(__name__)
21+
22+
nlp = spacy.load("en_core_web_sm")
23+
24+
25+
def parts_of_speech(text: str):
    """
    Tag *text* and bucket its tokens by part of speech.

    Returns a dict with two keys:
    - "nouns": spaCy tokens tagged PROPN (proper nouns)
    - "verbs": spaCy tokens tagged VERB
    """
    buckets = {"PROPN": [], "VERB": []}
    for token in nlp(text):
        if token.pos_ in buckets:
            buckets[token.pos_].append(token)
    return {"nouns": buckets["PROPN"], "verbs": buckets["VERB"]}
42+
43+
44+
def entities(text: str):
    """Return the surface text of every named entity found in *text*."""
    return [entity.text for entity in nlp(text).ents]
48+
49+
50+
def remove_punctuations(text: str):
    """Tokenise *text* and return the texts of all non-punctuation tokens."""
    kept = []
    for token in nlp(text):
        if not token.is_punct:
            kept.append(token.text)
    return kept
53+
54+
55+
def remove_stopwords(text: str):
    """Tokenise *text* and return the texts of all non-stopword tokens."""
    tokens = nlp(text)
    kept = [tok.text for tok in tokens if not tok.is_stop]
    return kept
58+
59+
60+
def remove_punctuations_and_stopwords(text: str):
    """
    Tokenise *text* and drop both stopwords and punctuation.

    Returns the surviving token texts as plain strings, consistent with
    remove_punctuations() and remove_stopwords(). (The previous version
    inconsistently returned spaCy Token objects instead of strings.)
    """
    doc = nlp(text)
    return [token.text for token in doc if not (token.is_stop or token.is_punct)]
67+
68+
69+
def summarize(text: str):
    """
    Produce a summary of *text*.

    Not implemented yet — currently a stub that returns None.
    TODO: implement summarization (listed under "Later" in the module docstring).
    """
    pass
71+
72+
73+
def converse(text: str, question: str):
    """
    Answer simple questions about *text* like:
    - Who did something?
    - When something happened?
    - How much does something take.
    - Where did something happen.

    The following constructs play a role here:
    - Parts of Speech (POS tagging)
    - Named Entities (NER)
    - Syntactic Dependencies (dep_)
    - Rule based matching. In addition to regex, use token attributes like is_punct, is_stop etc.

    Returns the answer as a plain string, or None when no answer can be
    found. (The previous version mixed return types — str, Token, Span —
    and raised IndexError on a "who" question with no grammatical subject,
    or when the first preposition was the sentence's last token.)
    """
    proper_nouns = []
    verbs = []
    subjects = []
    objects = []
    prepositions = []
    numerics = []
    dates = []
    doc = nlp(text)
    lowered_question = question.lower()
    for token in doc:
        # Lazy %-style logging args: formatting is skipped when INFO is off.
        logger.info("Token: %s, POS: %s", token.text, token.pos_)
        if token.pos_ == "PROPN":
            proper_nouns.append(token)
        if token.pos_ == "VERB":
            verbs.append(token)
        if token.dep_ == "nsubj":
            subjects.append(token)
        if token.dep_ == "pobj":
            objects.append(token)
        if token.pos_ == 'ADP':
            prepositions.append(token)
        if token.like_num:
            numerics.append(token)
    for ent in doc.ents:
        logger.info("Entity: %s, Type: %s", ent.text, ent.label_)
        if ent.label_ == "DATE":
            dates.append(ent)
    logger.info("Nouns: %s", proper_nouns)
    logger.info("Verbs: %s", verbs)
    logger.info("Subjects: %s", subjects)
    logger.info("Prepositions: %s", prepositions)
    if "who" in lowered_question:
        # The answer should probably be a proper noun.
        if len(proper_nouns) == 1:
            return proper_nouns[0].text
        # If there are multiple nouns, then most probably the subject instead
        # of the object is the answer; dependency parsing gives us that.
        # We are currently dealing with single sentences.
        # TODO: Modify it to get more context from the question, and then
        # infer the correct subject.
        if subjects:  # guard: previously raised IndexError on no subject
            return subjects[0].text
        return None
    if "where" in lowered_question:
        # It means we want a place as answer: probably a noun, very likely
        # inside a prepositional phrase.
        # Examples: They went "to" Colombo, kept on "the" table. etc.
        if objects:
            return objects[0].text
        # Statements like "apaar went to play": here "play" is not a pobj,
        # so fall back to the token appearing right after the preposition.
        if prepositions:
            prep = prepositions[0]
            if prep.i + 1 < len(doc):  # guard: prep may be the final token
                return doc[prep.i + 1].text
    if "how much" in lowered_question:
        # A quantity has to be returned; a quantity would mean a numeric.
        if numerics:
            return numerics[0].text
    if "when" in lowered_question:
        # A date has to be returned.
        if dates:
            return dates[0].text
    return None

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ rq # Redis-queue for task/worker setup
1111
Pillow # To perform image preprocessing before OCR. Performs grayscale conversion, denoising and binarization
1212
opencv-contrib-python # To perform image preprocessing
1313
python-Levenshtein # Fuzzy search during classification and structured data extraction
14+
spacy # Production-grade NLP instead of research-oriented NLP provided by nltk

0 commit comments

Comments
 (0)