ML.NaiveBayes
from .model import Model

from nltk import pos_tag
from nltk import NaiveBayesClassifier
from nltk import TweetTokenizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from random import shuffle
from statistics import mean

import joblib

class NaiveBayes(Model):
    def __init__(self) -> None:
        """
        Builds the sentiment analyser, lemmatizer, tokenizer, POS tag map and
        stopword set used in _preprocess.
        """
        super().__init__()
        # TD: should try to make stopwords static, and data needed for objects static also.
        self._sia = SentimentIntensityAnalyzer()
        self._lemmatizer = WordNetLemmatizer()

        self._tokenizer = TweetTokenizer(preserve_case=False,
                                         strip_handles=True,
                                         reduce_len=True)

        # Maps the first letter of a Penn Treebank tag to a WordNet POS.
        self._TAGMAP = {'V': 'v', 'J': 'a', 'N': 'n', 'R': 'r'}

        self._STOPWORDS = set(stopwords.words("english"))

    def _preprocess(self, tweet):
        """
        Cleans data before sentiment analysis, removing stopwords and
        non-alphabetic tokens.
        Args:
            tweet (str): tweet text
        Returns:
            data (list(str)): cleaned, lemmatized tokens
        """
        data = self._tokenizer.tokenize(tweet)

        data = [token for token in data if token.isalpha() and token not in self._STOPWORDS]

        # (low) TD: Pull request pos tag (tagset) to work with lemmatize
        data = pos_tag(data)

        # (low) TD: refactor if data + lemmatize + pos_tag
        # pos is a full Penn tag such as 'VBD', so key on its first letter;
        # default to noun ('n') for unmapped tags.
        data = [self._lemmatizer.lemmatize(token, self._TAGMAP.get(pos[0], 'n'))
                for token, pos in data]

        return data

    def _features(self, tweet):
        """
        Calculates the VADER scores (from lexicon data).
        Args:
            tweet (str): tweet text
        Returns:
            features (dict): 'pos_score' (mean positive score, 0 to 1) and
            'comp_score' (mean compound score, -1 to 1).
        """
        data = self._preprocess(tweet)

        if not data:
            return {}

        features = {}
        positive_scores = []
        compound_scores = []

        for word in data:
            scores = self._sia.polarity_scores(word)  # score each word once
            positive_scores.append(scores["pos"])
            compound_scores.append(scores["compound"])
        features['pos_score'] = mean(positive_scores)
        features['comp_score'] = mean(compound_scores)
        return features

    def _trainmodel(self):
        """
        Train NLTK's Naive Bayes probability classifier on the labelled data
        and dump it to a pkl file.
        """
        features = []
        for tweet in self.pos_data:
            features.append((self._features(tweet), 'p'))

        for tweet in self.neg_data:
            features.append((self._features(tweet), 'n'))

        shuffle(features)
        classifier = NaiveBayesClassifier.train(labeled_featuresets=features[:1500])
        joblib.dump(classifier, r'backend\ML\models\NaiveBayes.pkl')

    async def predict(self, tweet):
        """
        Classify a single tweet using the trained pkl file in models.
        Args:
            tweet (str): text data inside of the tweet.
        Returns:
            'p' to indicate positive and 'n' to indicate negative.
        """
        tweet = self._features(tweet)
        result = joblib.load(r'backend\ML\models\NaiveBayes.pkl').classify(tweet)
        return result

if __name__ == "__main__":
    from nltk import download
    # wordnet alone is not enough: the tagger, stopword list and VADER
    # lexicon are also needed at runtime.
    download(['wordnet', 'stopwords', 'vader_lexicon', 'averaged_perceptron_tagger'])
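The preprocessing and feature pipeline above can be exercised standalone. A minimal sketch, assuming the listed NLTK corpora are downloadable in your environment; the sample tweet and variable names are illustrative, not part of the module:

# Standalone sketch of the _preprocess + _features pipeline (illustrative).
from statistics import mean

from nltk import download, pos_tag
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

download(['wordnet', 'stopwords', 'vader_lexicon', 'averaged_perceptron_tagger'])

sia = SentimentIntensityAnalyzer()
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
TAGMAP = {'V': 'v', 'J': 'a', 'N': 'n', 'R': 'r'}
STOPWORDS = set(stopwords.words("english"))

tweet = "@user I looove this, it is soooo good!!!"  # assumed sample input
tokens = [t for t in tokenizer.tokenize(tweet)
          if t.isalpha() and t not in STOPWORDS]
lemmas = [lemmatizer.lemmatize(tok, TAGMAP.get(pos[0], 'n'))
          for tok, pos in pos_tag(tokens)]

scores = [sia.polarity_scores(word) for word in lemmas]
print({'pos_score': mean(s['pos'] for s in scores),
       'comp_score': mean(s['compound'] for s in scores)})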
class NaiveBayes(Model):
NaiveBayes()
" Builds Vectoriser and stopwords object to be used in _preprocess.
async def predict(self, tweet):
Classify a single tweet using the trained pkl file in models.
Args:
    tweet (str): text data inside of the tweet.
Returns:
    'p' to indicate positive and 'n' to indicate negative.
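A hypothetical call site for the async API, assuming the package is importable as ML.NaiveBayes and the trained pkl already exists under backend\ML\models; the import path and sample tweet are assumptions:

# Hypothetical usage sketch; not part of the module itself.
import asyncio

from ML.NaiveBayes import NaiveBayes

async def main():
    model = NaiveBayes()
    label = await model.predict("had a great day, loved every minute")
    print(label)  # 'p' or 'n'

asyncio.run(main())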