Before doing our subreddit analysis, we will train a model to classify whether a title is positive or negative. We will be using a dataset of 1,600,000 labelled Tweets (download).
The dependencies are listed in requirements.txt. They can be quickly installed with pip by running the following command

python -m pip install -r requirements.txt
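In addition to the pip packages, the NLTK data used later in the notebook needs to be downloaded once (this step is an assumption about the setup, since it is not covered by requirements.txt):

import nltk

nltk.download('punkt')                       # for word_tokenize
nltk.download('wordnet')                     # for WordNetLemmatizer
nltk.download('omw-1.4')                     # extra WordNet data needed by newer NLTK versions
nltk.download('stopwords')                   # for stopwords.words
nltk.download('averaged_perceptron_tagger')  # for pos_tag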
We successfully preprocessed a corpus of tweets and used them to train a Naive Bayes classifier with ~69% accuracy on the held-out test set. The model has been pickled to bin/classifier.o for future use.
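To reuse the saved model, it can be loaded back with pickle. A minimal sketch, assuming the training steps below have already been run to produce bin/classifier.o:

import pickle

with open('bin/classifier.o', 'rb') as f:
    classifier = pickle.load(f)

# The classifier expects a {token: True} feature dict, built the same way as during training
print(classifier.classify({'love': True, 'this': True}))  # prints 0 (negative) or 4 (positive)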
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer as wnl
from nltk.corpus import stopwords, wordnet
from nltk.tag import pos_tag
from nltk import classify, NaiveBayesClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import re, random, pickle
# Classification Report
              precision    recall  f1-score   support

           0       0.67      0.73      0.70      1480
           4       0.71      0.65      0.68      1520

    accuracy                           0.69      3000
   macro avg       0.69      0.69      0.69      3000
weighted avg       0.69      0.69      0.69      3000

# Confusion Matrix
[[1074  406]
 [ 531  989]]
There are a few things that need to be done before we can train our model:
Tokenization
All our tweets need to be tokenized before they can be processed. Tokenization isn't as simple as something like str.split('.') though. Take the sentence "Mr.John likes iced coffee": naively splitting on periods would break "Mr.John" into two tokens, which is not what we want.
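As a quick illustration (a sketch; the exact word_tokenize output can vary slightly between NLTK versions):

from nltk.tokenize import word_tokenize

sentence = "Mr.John likes iced coffee."
print(sentence.split('.'))      # ['Mr', 'John likes iced coffee', ''] -- "Mr.John" is torn apart
print(word_tokenize(sentence))  # should keep 'Mr.John' intact, e.g. ['Mr.John', 'likes', 'iced', 'coffee', '.']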
Lemmatization
This is the process of mapping a word to its root form (its lemma), and it is similar to stemming. Many inflected forms share a root: for instance, "great", "greater", and "greatest" all reduce to "great".
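A small sketch using the WordNetLemmatizer and wordnet imported above (passing the POS hint matters; without it the lemmatizer assumes a noun):

lemmatizer = wnl()
print(lemmatizer.lemmatize('greater', wordnet.ADJ))   # should reduce to 'great'
print(lemmatizer.lemmatize('greatest', wordnet.ADJ))  # should reduce to 'great'
print(lemmatizer.lemmatize('running', wordnet.VERB))  # should reduce to 'run'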
Normalization - Stopword removal + Lowercasing
Uppercase and lowercase forms of a word carry the same meaning, so we lowercase everything. We will also remove stopwords: commonly used words such as "a", "are", and "may" that are not significant parts of the sentence.
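A minimal illustration with the stopword list imported above:

stop_words = set(stopwords.words('english'))
tokens = "You are a great friend".lower().split()
print([t for t in tokens if t not in stop_words])  # should keep only the content words, e.g. ['great', 'friend']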
Noise Removal
Remove Twitter handles, hashtags, phone numbers, and special characters, which hold no textual value and may interfere with our training process. In our dataset, emojis have already been removed.
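As a rough sketch of this step on a made-up tweet (simplified patterns; the actual regexes used are in preprocess_tweet below):

import re

noisy = "@some_user check this out http://example.com #wow !!!"
clean = re.sub(r'http[s]?://\S+', '', noisy)   # drop links
clean = re.sub(r'@[A-Za-z0-9_]+', '', clean)   # drop handles
clean = re.sub(r'[^a-zA-Z\s]', '', clean)      # drop remaining special characters
print(clean.split())  # expected: ['check', 'this', 'out', 'wow']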
"""
Returns a generator to get all tokens in a list of sentences
"""
def get_all_tokens(sentences):
for sent in sentences:
for token in sent:
yield token
"""
Returns the corresponding wordnet tag for a parts of speach (pos) tag
"""
def get_wordnet_tag(tag):
if tag.startswith('J'):
return wordnet.ADJ
elif tag.startswith('V'):
return wordnet.VERB
elif tag.startswith('N'):
return wordnet.NOUN
elif tag.startswith('R'):
return wordnet.ADV
else:
return wordnet.NOUN
"""
Return dictionary for existing dataset to for model
"""
def sentence_to_dict(dataset):
for tokens in dataset:
yield dict([token, True] for token in tokens)
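As a quick sanity check, the Penn Treebank tags produced by pos_tag map through get_wordnet_tag to WordNet POS constants:

print(get_wordnet_tag('VBG'))  # 'v' (wordnet.VERB), e.g. "running"
print(get_wordnet_tag('JJ'))   # 'a' (wordnet.ADJ), e.g. "great"
print(get_wordnet_tag('RB'))   # 'r' (wordnet.ADV), e.g. "quickly"
print(get_wordnet_tag('NNP'))  # 'n' (wordnet.NOUN) -- the default fallback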
df = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')
df.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']
df.drop(['id', 'date', 'flag', 'user'], axis=1, inplace=True)  # Don't need these columns
positives = df.loc[df['sentiment'] == 4]
negatives = df.loc[df['sentiment'] == 0]
df = pd.concat([positives.head(5000), negatives.head(5000)])  # balanced subset: 5,000 positive + 5,000 negative tweets
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 799999 to 4999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   sentiment  10000 non-null  int64
 1   text       10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 234.4+ KB
def preprocess_tweet(text, lang='english'):
    lemmatizer = wnl()
    stop_words = stopwords.words(lang)
    result = []
    for part, tag in pos_tag(text.split()):
        # Remove links
        part = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', part)
        # Remove handles
        part = re.sub(r'(@[A-Za-z0-9_]+)', '', part)
        part = part.lower().strip()
        # Map common emoticons to words (the text is already lowercased at this point)
        part = part.replace(':)', 'smile')
        part = part.replace(':(', 'sad')
        part = part.replace(':/', 'frown')
        part = part.replace(';)', 'wink')
        part = part.replace(':d', 'big smile')
        part = part.replace(';d', 'big smile')
        # Skip anything left empty after cleaning
        if not part:
            continue
        # If it's a stopword we don't want to add it to our token list
        if part in stop_words:
            continue
        wordnet_pos = get_wordnet_tag(tag)
        result.append(lemmatizer.lemmatize(part, wordnet_pos))
    return " ".join(result)
df['text'] = df['text'].apply(preprocess_tweet)
For reference, a raw tweet from the dataset looks like this:

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D

After preprocessing, each tweet is reduced to its cleaned, lemmatized tokens:

print(df.iloc[0][1])

love u guy r best!!
We need to format the data as a labelled featureset; this is a requirement to train the model. We will do a 70/30 train-test split.
# Shuffle
df = shuffle(df)
# Partition to train and test
df_train, df_test = train_test_split(df, test_size=0.3)
# Build the labelled featureset: a list of ({token: True, ...}, sentiment) tuples
def create_dataset(df):
dataset = []
for i, row in df.iterrows():
tweet = row['text']
sent = row['sentiment']
text_dict = dict([word, True] for word in word_tokenize(tweet))
dataset.append((text_dict, sent))
return dataset
d_train = create_dataset(df_train)
d_test = create_dataset(df_test)
d_train[2]
({'good': True, 'morning': True, 'tom': True, 'dougie': True, ',': True, 'today': True, '?': True}, 4)
We train our model with the Naive Bayes classifier and use pickle to serialize it so it can be reused later. The classifier is 'naive' because it assumes that all features are independent of each other. In our data, each feature is the presence of a word in a tweet, and the label is the sentiment (0 or 4).
classifier = NaiveBayesClassifier.train(d_train)
print('Accuracy: {}'.format(classify.accuracy(classifier, d_test)))
# Save our model so we can reuse it later!
pickle.dump(classifier, open('bin/classifier.o', 'wb'))
Accuracy: 0.6876666666666666
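Optionally, NLTK's NaiveBayesClassifier can report which word features it found most informative (the exact output will differ from run to run because of the shuffle and split):

classifier.show_most_informative_features(10)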
classifier.classify({'How': True, 'are': True, 'you': True, '!': True})
4
4 means it is a positive sentiment.
classifier.classify({'Worst': True, 'neighbour': True, 'ever': True, 'bad': True, 'smell': True})
0
0 means it is a negative sentiment.
predictions = []
actual = []
for test in d_test:
predictions.append(classifier.classify(test[0])) # 3000 predictions
actual.append(test[1])
print('',classification_report(actual, predictions))
              precision    recall  f1-score   support

           0       0.67      0.73      0.70      1480
           4       0.71      0.65      0.68      1520

    accuracy                           0.69      3000
   macro avg       0.69      0.69      0.69      3000
weighted avg       0.69      0.69      0.69      3000
print('',confusion_matrix(actual, predictions))
[[1074  406]
 [ 531  989]]
This is an okay model for now, but we can make some improvements in the future.