ML.NaiveBayes
from .model import Model

from nltk import pos_tag
from nltk import NaiveBayesClassifier
from nltk import TweetTokenizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from random import shuffle
from statistics import mean

import joblib

class NaiveBayes(Model):
    def __init__(self) -> None:
        """
        Builds the sentiment analyser, lemmatizer, tokenizer, POS tag map and
        stopword set used in _preprocess.
        """
        super().__init__()
        # TD: should try to make stopwords static, and data needed for objects static also.
        self._sia = SentimentIntensityAnalyzer()
        self._lemmatizer = WordNetLemmatizer()

        self._tokenizer = TweetTokenizer(preserve_case=False,
                                         strip_handles=True,
                                         reduce_len=True)

        # Maps the first letter of a Penn Treebank tag to a WordNet POS.
        self._TAGMAP = {'V': 'v', 'J': 'a', 'N': 'n', 'R': 'r'}

        self._STOPWORDS = set(stopwords.words("english"))

    def _preprocess(self, tweet):
        """
        Cleans data before sentiment analysis, removing stopwords and
        non-alphabetic tokens.
        Args:
            tweet (str): tweet text
        Returns:
            data (list(str)): cleaned, lemmatized tokens
        """
        data = self._tokenizer.tokenize(tweet)

        data = [token for token in data if token.isalpha() and token not in self._STOPWORDS]

        # (low) TD: Pull request pos tag (tagset) to work with lemmatize
        data = pos_tag(data)

        # (low) TD: refactor if data + lemmatize + pos_tag
        # pos is a full Penn tag such as 'VBD', so key on its first letter;
        # default to noun ('n') for unmapped tags.
        data = [self._lemmatizer.lemmatize(token, self._TAGMAP.get(pos[0], 'n'))
                for token, pos in data]

        return data

    def _features(self, tweet):
        """
        Calculates the VADER scores (from lexicon data).
        Args:
            tweet (str): tweet text
        Returns:
            features (dict): 'pos_score' (mean positive score, 0 to 1) and
            'comp_score' (mean compound score, -1 to 1).
        """
        data = self._preprocess(tweet)

        if not data:
            return {}

        features = {}
        positive_scores = []
        compound_scores = []

        for word in data:
            scores = self._sia.polarity_scores(word)  # score each word once
            positive_scores.append(scores["pos"])
            compound_scores.append(scores["compound"])
        features['pos_score'] = mean(positive_scores)
        features['comp_score'] = mean(compound_scores)
        return features

    def _trainmodel(self):
        """
        Train NLTK's Naive Bayes probability classifier on the labelled data
        and dump it to a pkl file.
        """
        features = []
        for tweet in self.pos_data:
            features.append((self._features(tweet), 'p'))

        for tweet in self.neg_data:
            features.append((self._features(tweet), 'n'))

        shuffle(features)
        classifier = NaiveBayesClassifier.train(labeled_featuresets=features[:1500])
        joblib.dump(classifier, r'backend\ML\models\NaiveBayes.pkl')

    async def predict(self, tweet):
        """
        Classify a single tweet using the trained pkl file in models.
        Args:
            tweet (str): text data inside of the tweet.
        Returns:
            'p' to indicate positive and 'n' to indicate negative.
        """
        tweet = self._features(tweet)
        result = joblib.load(r'backend\ML\models\NaiveBayes.pkl').classify(tweet)
        return result

if __name__ == "__main__":
    from nltk import download
    # wordnet alone is not enough: the tagger, stopword list and VADER
    # lexicon are also needed at runtime.
    download(['wordnet', 'stopwords', 'vader_lexicon', 'averaged_perceptron_tagger'])
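The preprocessing and feature pipeline above can be exercised standalone. A minimal sketch, assuming the listed NLTK corpora are downloadable in your environment; the sample tweet and variable names are illustrative, not part of the module:

# Standalone sketch of the _preprocess + _features pipeline (illustrative).
from statistics import mean

from nltk import download, pos_tag
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

download(['wordnet', 'stopwords', 'vader_lexicon', 'averaged_perceptron_tagger'])

sia = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
TAGMAP = {'V': 'v', 'J': 'a', 'N': 'n', 'R': 'r'}
STOPWORDS = set(stopwords.words("english"))

tweet = "@user I looove this, it is soooo good!!!"  # assumed sample input
tokens = [t for t in tokenizer.tokenize(tweet)
          if t.isalpha() and t not in STOPWORDS]
lemmas = [lemmatizer.lemmatize(tok, TAGMAP.get(pos[0], 'n'))
          for tok, pos in pos_tag(tokens)]

scores = [sia.polarity_scores(word) for word in lemmas]
print({'pos_score': mean(s['pos'] for s in scores),
       'comp_score': mean(s['compound'] for s in scores)})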
class NaiveBayes(Model):
NaiveBayes()
" Builds Vectoriser and stopwords object to be used in _preprocess.
async def predict(self, tweet):
Classify a single tweet using the trained pkl file in models.
Args:
    tweet (str): text data inside of the tweet.
Returns:
    'p' to indicate positive and 'n' to indicate negative.
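A hypothetical call site for the async API, assuming the package is importable as ML.NaiveBayes and the trained pkl already exists under backend\ML\models; the import path and sample tweet are assumptions:

# Hypothetical usage sketch; not part of the module itself.
import asyncio

from ML.NaiveBayes import NaiveBayes

async def main():
    model = NaiveBayes()
    label = await model.predict("had a great day, loved every minute")
    print(label)  # 'p' or 'n'

asyncio.run(main())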