28 changes: 14 additions & 14 deletions BagofWords.py
@@ -32,28 +32,28 @@
I see four milestones in my career"""


# Cleaning the texts
import re
# Cleaning the texts (DATA CLEANING)
import re # for regular expressions
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
ps = PorterStemmer() # in practice lemmatization is usually preferred, but stemming is shown here too
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
corpus = [] # the cleaned sentences will be collected here
for i in range(len(sentences)):
review = re.sub('[^a-zA-Z]', ' ', sentences[i])
review = review.lower()
review = review.split()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus.append(review)
review = re.sub('[^a-zA-Z]', ' ', sentences[i]) # replace every character other than a-z/A-Z (e.g. ", . / ?") with a space; applied to each tokenized sentence, like arr[i]
review = review.lower() # lowercase the sentence
review = review.split() # .split() breaks the sentence into a list of words, basically word tokenization
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] # stem each word that is not a stopword; stopwords are dropped
review = ' '.join(review) # rejoin the words into the cleaned sentence
corpus.append(review) # store the cleaned sentence in the "corpus" list
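
For quick verification, here is a minimal standalone sketch of the same cleaning steps applied to one hypothetical sample sentence (the sample text and printed output are illustrative only):

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords', quiet=True)        # stopword list is needed once

sample = "In 3000 years of our history people from all over the world have come and invaded us"
review = re.sub('[^a-zA-Z]', ' ', sample)     # keep letters only, everything else becomes a space
review = review.lower().split()               # lowercase, then split into words
ps = PorterStemmer()
review = [ps.stem(w) for w in review if w not in set(stopwords.words('english'))]
print(' '.join(review))                       # stemmed sentence without stopwords, e.g. "year histori peopl ... invad"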

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
# Creating the Bag of Words model (FEATURE EXTRACTION)
from sklearn.feature_extraction.text import CountVectorizer # CountVectorizer from scikit-learn builds the Bag of Words model
cv = CountVectorizer(max_features = 1500) # cv is a CountVectorizer instance; max_features=1500 limits the vocabulary to the 1500 most frequent words
X = cv.fit_transform(corpus).toarray() # fit on the cleaned corpus and convert the sparse count matrix to a dense array
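
To see what the Bag of Words matrix looks like, a minimal sketch on a tiny hypothetical corpus (assuming scikit-learn >= 1.0; older releases expose cv.get_feature_names() instead of get_feature_names_out()):

from sklearn.feature_extraction.text import CountVectorizer

mini_corpus = ["india develop nation", "india vision develop", "nation respect india"]  # hypothetical cleaned sentences
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(mini_corpus).toarray()

print(cv.get_feature_names_out())   # the vocabulary: one column per word
print(X)                            # one row per sentence, raw word counts
print(X.shape)                      # (number of sentences, vocabulary size)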



7 changes: 4 additions & 3 deletions Lemmatization.py
@@ -34,10 +34,11 @@


sentences = nltk.sent_tokenize(paragraph)
lemmatizer = WordNetLemmatizer()
lemmatizer = WordNetLemmatizer() #WordNetLemmatizer() for lemmatization

# Lemmatization
for i in range(len(sentences)):
words = nltk.word_tokenize(sentences[i])
words = nltk.word_tokenize(sentences[i]) # tokenizing each sentence into words
words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
sentences[i] = ' '.join(words)
#each word that is not a stopword is lemmatized; stopwords are dropped
sentences[i] = ' '.join(words)
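
One point worth noting: WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is passed. A minimal sketch (word choices are illustrative):

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)              # WordNet data is needed once
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("histories"))          # history  (default POS is noun)
print(lemmatizer.lemmatize("running"))            # running  (unchanged when treated as a noun)
print(lemmatizer.lemmatize("running", pos="v"))   # run      (treated as a verb)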
6 changes: 5 additions & 1 deletion README.md
@@ -1 +1,5 @@
# Natural-Language-Processing
# Natural-Language-Processing (NLP)



-SOUMYO NATH TRIPATHY
15 changes: 8 additions & 7 deletions Stemming.py
@@ -34,14 +34,16 @@
I see four milestones in my career"""


sentences = nltk.sent_tokenize(paragraph)
stemmer = PorterStemmer()
sentences = nltk.sent_tokenize(paragraph) # tokenizing the paragraph into sentences
stemmer = PorterStemmer() # "stemmer" is a PorterStemmer object

# Stemming
for i in range(len(sentences)):
words = nltk.word_tokenize(sentences[i])
words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
sentences[i] = ' '.join(words)
for i in range(len(sentences)): # loop over all sentences (tokenizing the paragraph here yields a list of 31 sentences)
words = nltk.word_tokenize(sentences[i]) # tokenizing each sentence into words
words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
#each word is stemmed only if it is not a stopword; stopwords are dropped
sentences[i] = ' '.join(words) # joining the words back into the resulting sentence
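
For comparison with lemmatization, a minimal sketch of what PorterStemmer does to individual words (the outputs shown in the comment are typical, not guaranteed):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
for word in ["history", "finally", "visions", "running"]:
    print(word, "->", stemmer.stem(word))   # typically: histori, final, vision, run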




@@ -50,4 +52,3 @@




8 changes: 2 additions & 6 deletions TFIDF.py
@@ -1,8 +1,4 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 2 22:19:40 2019
@author: Krish.Naik
"""


import nltk

@@ -50,6 +46,6 @@
corpus.append(review)

# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer # TfidfVectorizer builds the TF-IDF model; everything else is the same as Bag of Words
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()
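
As with Bag of Words, the resulting matrix can be inspected directly; a minimal sketch on a tiny hypothetical corpus (assuming scikit-learn >= 1.0 for get_feature_names_out()):

from sklearn.feature_extraction.text import TfidfVectorizer

mini_corpus = ["india develop nation", "india vision develop", "nation respect india"]  # hypothetical cleaned sentences
cv = TfidfVectorizer()
X = cv.fit_transform(mini_corpus).toarray()

print(cv.get_feature_names_out())   # vocabulary
print(cv.idf_)                      # inverse document frequency learned for each term
print(X.round(2))                   # TF-IDF weights: words shared by all sentences get lower scores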
7 changes: 1 addition & 6 deletions word2vec.py
@@ -1,9 +1,4 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 12:55:52 2019

@author: krish.naik
"""
import nltk

from gensim.models import Word2Vec
@@ -63,4 +58,4 @@
vector = model.wv['war']

# Most similar words
similar = model.wv.most_similar('vikram')
similar = model.wv.most_similar('vikram')
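
For context, a minimal end-to-end sketch of training a small Word2Vec model with gensim 4.x (the parameter is vector_size there; older gensim versions call it size; the token lists below are hypothetical):

from gensim.models import Word2Vec

tokenized_sentences = [["india", "develop", "nation"],
                       ["vikram", "sarabhai", "vision"],
                       ["india", "vision", "develop"]]

model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=1)

vector = model.wv["india"]                # 100-dimensional embedding for "india"
similar = model.wv.most_similar("india")  # nearest words by cosine similarity
print(vector.shape)
print(similar[:3])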