|
| 1 | +import joblib |
| 2 | +from sklearn.feature_extraction.text import CountVectorizer |
| 3 | +from sklearn.naive_bayes import MultinomialNB |
| 4 | +import pandas as pd |
| 5 | +from src.algo.data_preprocessor import text_cleaner |
| 6 | + |
| 7 | + |
# For production use we need the shortest possible processing time, so we
# deserialize pre-trained joblib pickle files instead of re-running the
# training calculation on every call.
def production_multinomial(testing_data, layer):
    """Classify *testing_data* using the pre-trained model for *layer*.

    Parameters
    ----------
    testing_data : iterable of str
        Pre-processed text documents to classify.
    layer : str
        Which serialized model pair to load: ``"sarcasm"`` or ``"spam"``.

    Returns
    -------
    list
        One label per input item, 1 for yes and 0 for no.

    Raises
    ------
    ValueError
        If *layer* does not name a known model pair.
    """
    # Map each layer to its serialized (CountVectorizer, MultinomialNB) pair.
    pickle_paths = {
        "sarcasm": ('src/algo/sarcasmpickle_countvectorizer.pkl',
                    'src/algo/sarcasmpickle_multinomial.pkl'),
        "spam": ('src/algo/spampickle_countvectorizer.pkl',
                 'src/algo/spampickle_multinomial.pkl'),
    }
    if layer not in pickle_paths:
        # Previously an unfitted CountVectorizer/MultinomialNB placeholder was
        # used as a silent fallback, which crashed later with a confusing
        # sklearn NotFittedError.  Fail fast with a clear message instead.
        raise ValueError(
            "unknown layer: %r (expected 'sarcasm' or 'spam')" % (layer,))
    vectorizer_path, model_path = pickle_paths[layer]

    # Deserialize the fitted CountVectorizer and vectorize the test data.
    pickled_count_vectorizer = joblib.load(vectorizer_path)
    X_test = pickled_count_vectorizer.transform(testing_data)

    # Deserialize the fitted MultinomialNB classifier and predict.
    pickled_multinomial_nv = joblib.load(model_path)
    # .tolist() converts the numpy label array to a plain Python list.
    prediction_of_each_data = pickled_multinomial_nv.predict(X_test).tolist()
    # Returns a list of 1/0 items where 1 means yes and 0 means no.
    return prediction_of_each_data
| 33 | + |
| 34 | + |
# This debug function is needed to refresh our training model whenever new
# data is added to Sentimento's training datasets.  Unlike the production
# path it re-runs the count-vectorization over the full training set, which
# takes much longer than loading the pickled artifacts.
def debug_multinomial(testing_data, layer):
    """Re-train the *layer* model, pickle it, and classify *testing_data*.

    Parameters
    ----------
    testing_data : iterable of str
        Pre-processed text documents to classify.
    layer : str
        Which training set to use: ``"sarcasm"`` or ``"spam"``.

    Returns
    -------
    list
        One label per input item, 1 for yes and 0 for no.

    Raises
    ------
    ValueError
        If *layer* does not name a known training set.
    """
    preprocessed_training_data = []
    training_label = []
    # NOTE(review): the column indices below assume the project's CSV
    # layouts — text in column 3 / label in column 4 for spam, text in
    # column 0 / label in column 1 for sarcasm; confirm against the files.
    if layer == "spam":
        # BUG FIX: the original path used dots ('src.algo.spam_training.csv'),
        # a module-style string pandas cannot open; it must be a file path.
        training_data = pd.read_csv('src/algo/spam_training.csv').values
        for each in training_data:
            preprocessed_training_data.append(text_cleaner(each[3]))
            training_label.append(each[4])
    elif layer == "sarcasm":
        training_data = pd.read_csv('src/algo/sarcasm_training.csv').values
        for each in training_data:
            preprocessed_training_data.append(text_cleaner(each[0]))
            training_label.append(each[1])
    else:
        # Previously an unknown layer fell through to fit_transform([]) and
        # crashed with an obscure sklearn error; fail fast instead.
        raise ValueError(
            "unknown layer: %r (expected 'sarcasm' or 'spam')" % (layer,))

    # Count-vectorize the training corpus (unigrams + bigrams).
    cv = CountVectorizer(ngram_range=(1, 2))
    X_train = cv.fit_transform(preprocessed_training_data)

    # Serialization: save the fitted vectorizer for production use.
    # CONSISTENCY FIX: dump to the same 'src/algo/' paths that
    # production_multinomial loads from (the original wrote to the CWD).
    joblib.dump(cv, 'src/algo/' + layer + 'pickle_countvectorizer.pkl')

    X_test = cv.transform(testing_data)
    mn = MultinomialNB()
    mn.fit(X_train, training_label)

    # Serialization: save the fitted classifier for production use.
    joblib.dump(mn, 'src/algo/' + layer + 'pickle_multinomial.pkl')

    # .tolist() converts the numpy label array to a plain Python list of
    # 1/0 items where 1 means yes and 0 means no.
    prediction_of_each_data = mn.predict(X_test).tolist()
    return prediction_of_each_data
0 commit comments