28 changes: 14 additions & 14 deletions BagofWords.py
@@ -32,28 +32,28 @@
I see four milestones in my career"""


# Cleaning the texts
import re
# Cleaning the texts (DATA CLEANING)
import re # for regular expressions
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
ps = PorterStemmer() # in practice lemmatization is usually preferred, but stemming is shown here too
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
corpus = []
corpus = [] # the cleaned sentences will be collected here
for i in range(len(sentences)):
review = re.sub('[^a-zA-Z]', ' ', sentences[i])
review = review.lower()
review = review.split()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review = ' '.join(review)
corpus.append(review)
review = re.sub('[^a-zA-Z]', ' ', sentences[i]) # replace every character other than a-z/A-Z (e.g. ", . / ?") with a space; applied to each tokenized sentence, like arr[i]
review = review.lower() # lowercase the sentence
review = review.split() # .split() breaks the sentence into a list of words, basically word tokenization
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] # stem each word that is not a stopword; stopwords are dropped
review = ' '.join(review) # rejoin the words into the cleaned sentence
corpus.append(review) # store the cleaned sentence in the "corpus" list
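
For quick verification, here is a minimal standalone sketch of the same cleaning steps applied to one hypothetical sample sentence (the sample text and printed output are illustrative only):

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords', quiet=True)        # stopword list is needed once

sample = "In 3000 years of our history people from all over the world have come and invaded us"
review = re.sub('[^a-zA-Z]', ' ', sample)     # keep letters only, everything else becomes a space
review = review.lower().split()               # lowercase, then split into words
ps = PorterStemmer()
review = [ps.stem(w) for w in review if w not in set(stopwords.words('english'))]
print(' '.join(review))                       # stemmed sentence without stopwords, e.g. "year histori peopl ... invad"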

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
# Creating the Bag of Words model (FEATURE EXTRACTION)
from sklearn.feature_extraction.text import CountVectorizer # CountVectorizer from scikit-learn builds the Bag of Words model
cv = CountVectorizer(max_features = 1500) # cv is a CountVectorizer instance; max_features=1500 limits the vocabulary to the 1500 most frequent words
X = cv.fit_transform(corpus).toarray() # fit on the cleaned corpus and convert the sparse count matrix to a dense array
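
To see what the Bag of Words matrix looks like, a minimal sketch on a tiny hypothetical corpus (assuming scikit-learn >= 1.0; older releases expose cv.get_feature_names() instead of get_feature_names_out()):

from sklearn.feature_extraction.text import CountVectorizer

mini_corpus = ["india develop nation", "india vision develop", "nation respect india"]  # hypothetical cleaned sentences
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(mini_corpus).toarray()

print(cv.get_feature_names_out())   # the vocabulary: one column per word
print(X)                            # one row per sentence, raw word counts
print(X.shape)                      # (number of sentences, vocabulary size)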



7 changes: 4 additions & 3 deletions Lemmatization.py
@@ -34,10 +34,11 @@


sentences = nltk.sent_tokenize(paragraph)
lemmatizer = WordNetLemmatizer()
lemmatizer = WordNetLemmatizer() #WordNetLemmatizer() for lemmatization

# Lemmatization
for i in range(len(sentences)):
words = nltk.word_tokenize(sentences[i])
words = nltk.word_tokenize(sentences[i]) # tokenizing each sentence into words
words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
sentences[i] = ' '.join(words)
#each word that is not a stopword is lemmatized; stopwords are dropped
sentences[i] = ' '.join(words)
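
One point worth noting: WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is passed. A minimal sketch (word choices are illustrative):

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)              # WordNet data is needed once
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("histories"))          # history  (default POS is noun)
print(lemmatizer.lemmatize("running"))            # running  (unchanged when treated as a noun)
print(lemmatizer.lemmatize("running", pos="v"))   # run      (treated as a verb)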
6 changes: 5 additions & 1 deletion README.md
@@ -1 +1,5 @@
# Natural-Language-Processing
# Natural-Language-Processing (NLP)



-SOUMYO NATH TRIPATHY
15 changes: 8 additions & 7 deletions Stemming.py
@@ -34,14 +34,16 @@
I see four milestones in my career"""


sentences = nltk.sent_tokenize(paragraph)
stemmer = PorterStemmer()
sentences = nltk.sent_tokenize(paragraph) # tokenizing the paragraph into sentences
stemmer = PorterStemmer() # "stemmer" is a PorterStemmer object

# Stemming
for i in range(len(sentences)):
words = nltk.word_tokenize(sentences[i])
words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
sentences[i] = ' '.join(words)
for i in range(len(sentences)): # loop over all sentences (tokenizing the paragraph here yields a list of 31 sentences)
words = nltk.word_tokenize(sentences[i]) # tokenizing each sentence into words
words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
#each word is stemmed only if it is not a stopword; stopwords are dropped
sentences[i] = ' '.join(words) # joining the words back into the resulting sentence
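
For comparison with lemmatization, a minimal sketch of what PorterStemmer does to individual words (the outputs shown in the comment are typical, not guaranteed):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
for word in ["history", "finally", "visions", "running"]:
    print(word, "->", stemmer.stem(word))   # typically: histori, final, vision, run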




@@ -50,4 +52,3 @@




8 changes: 2 additions & 6 deletions TFIDF.py
@@ -1,8 +1,4 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 2 22:19:40 2019
@author: Krish.Naik
"""


import nltk

@@ -50,6 +46,6 @@
corpus.append(review)

# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer # TfidfVectorizer builds the TF-IDF model; everything else is the same as Bag of Words
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()
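
As with Bag of Words, the resulting matrix can be inspected directly; a minimal sketch on a tiny hypothetical corpus (assuming scikit-learn >= 1.0 for get_feature_names_out()):

from sklearn.feature_extraction.text import TfidfVectorizer

mini_corpus = ["india develop nation", "india vision develop", "nation respect india"]  # hypothetical cleaned sentences
cv = TfidfVectorizer()
X = cv.fit_transform(mini_corpus).toarray()

print(cv.get_feature_names_out())   # vocabulary
print(cv.idf_)                      # inverse document frequency learned for each term
print(X.round(2))                   # TF-IDF weights: words shared by all sentences get lower scores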
7 changes: 1 addition & 6 deletions word2vec.py
@@ -1,9 +1,4 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 12:55:52 2019

@author: krish.naik
"""
import nltk

from gensim.models import Word2Vec
@@ -63,4 +58,4 @@
vector = model.wv['war']

# Most similar words
similar = model.wv.most_similar('vikram')
similar = model.wv.most_similar('vikram')
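
For context, a minimal end-to-end sketch of training a small Word2Vec model with gensim 4.x (the parameter is vector_size there; older gensim versions call it size; the token lists below are hypothetical):

from gensim.models import Word2Vec

tokenized_sentences = [["india", "develop", "nation"],
                       ["vikram", "sarabhai", "vision"],
                       ["india", "vision", "develop"]]

model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=1)

vector = model.wv["india"]                # 100-dimensional embedding for "india"
similar = model.wv.most_similar("india")  # nearest words by cosine similarity
print(vector.shape)
print(similar[:3])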