In this project we build a classification model from "trainingset.csv", which contains 1557 news articles from 5 categories: ['business', 'entertainment', 'politics', 'sport', 'tech']. The aim is to apply the learned model to predict labels for "testdata.csv", a dataset of unlabelled articles.
#Import relevant libraries
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import operator
import pandas as pd
import scikitplot as skplt
import string
import time
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import label_binarize
from wordcloud import WordCloud
In this section we will load and clean the raw csv file.
Here, we load the training set file. We use '^' as the separator so that the content and category columns are split correctly, allowing us to see which category each article falls into.
raw_trainset = pd.read_csv('trainingset.csv',sep='^',header=0)
raw_trainset.head()
We check that there are no missing values in the content or category columns, since missing data would cause problems when training the classifier.
if raw_trainset['content'].isnull().sum() > 0:
    print("Content Missing:")
    print(raw_trainset[raw_trainset['content'].isnull()])
else:
    print("All content present")
if raw_trainset['category'].isnull().sum() > 0:
    print("Category Data Missing:")
    print(raw_trainset[raw_trainset['category'].isnull()])
else:
    print("All categories present")
There are no fields missing, so we do not have to perform any additional steps here.
The project outline states that there should only be 5 categories: business, entertainment, politics, sport and tech. This section will check whether any categories appear that should not be in our dataset.
print(raw_trainset.groupby("category").size())
There are only 5 categories here and no outliers, so we do not need to perform any additional steps.
It is important to eliminate duplicates so as not to skew the results of the analysis. We are going to check two things here: whether any articles share the same content, and whether any duplicated articles have conflicting category labels. If there are conflicting labels, they will confuse the classifier.
#check if any of the articles are the same
raw_trainset['duplicateContent'] = raw_trainset.duplicated(subset='content', keep=False)
#check if there are ROWS the same - ie the article AND category on a row matches another row
raw_trainset['duplicateRow'] = raw_trainset.duplicated(subset=None, keep=False)
# If the num duplicate rows = num duplicate categories, i.e no conflicting data
#(no articles that are listed twice have different categories)
if(len(raw_trainset[raw_trainset['duplicateRow'] == True]) == len(raw_trainset[raw_trainset['duplicateContent'] == True])):
print("Category is the same for duplicate content (no conflicting data)")
else:
print("Conflicting Data")
#make a new training set
new_trainset = raw_trainset.drop_duplicates(subset='content', keep='first', inplace=False).filter(['content', 'category'])
#show how many duplicates were removed
print("\nOld trainset length: " + str(len(raw_trainset)))
print("New trainset length: " + str(len(new_trainset)))
Tokenization is the act of breaking up a sequence of strings into pieces such as words, keywords, phrases, symbols and other elements called tokens. In the process of tokenization, some characters like punctuation marks are discarded.
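As a quick illustration, using the RegexpTokenizer imported above on a made-up sentence (not one from our dataset), the r'\w+' pattern keeps runs of word characters and drops punctuation:
demo_tokenizer = RegexpTokenizer(r'\w+') #keep runs of word characters only
print(demo_tokenizer.tokenize("The U.S. market rose 2.5% today!")) #e.g. ['The', 'U', 'S', 'market', 'rose', '2', '5', 'today']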
wordExp = r'\w+' # this pattern finds all the words
tokenizer = RegexpTokenizer(wordExp)
tokensList = [] # this is a list of lists. Each list contains the tokens of a document.
for content in new_trainset['content']:
token_words = tokenizer.tokenize(content) #tokenize all words in the document
tokensList.append(token_words) #add this list to tokensList
We now have a list of lists, where each inner list contains the tokens of one article, in order. Below we show the first two entries.
print(tokensList[:2])
def decapitalise(list_of_tokenLists):
newTokenList = [] #this will store the new list
for tokens in list_of_tokenLists: #for each list in the big list
decapitalised = [] #this will store a list of decapitalised tokens from a single doc
for word in tokens: #for each word in the list
if not word.isupper(): #don't decapitalise if the whole word is in uppercase anyway - avoids decapitalising acronyms like US, LA, ID
decapitalised.append(word.lower()) # change to lower case
else:
decapitalised.append(word)
newTokenList.append(decapitalised) #add decapitalised list to overall list
return newTokenList
def remove_stopwords(list_of_tokenLists):
stopword_list = stopwords.words('english') #import default nltk stopwords
newTokenList = [] #this will store the new list
for tokens in list_of_tokenLists: #for each list in the big list
notStop = [] #this will store a list of non-stopword tokens from a single doc
for word in tokens: #for each word in the list
if word not in stopword_list: #if word is not a stopword, append it
notStop.append(word)
newTokenList.append(notStop) #append newlist to the overall list
return newTokenList
def remove_numbers(list_of_tokenLists):
newTokenList = [] #this will store the new list
for tokens in list_of_tokenLists: #for each list in the big list
noNumbers= [] #this will store a list of tokens from a single doc
        for word in tokens: #for each word in the list
            if not word.isdigit(): #keep the token only if it is not purely numeric
                noNumbers.append(word)
newTokenList.append(noNumbers)
return newTokenList
#expandContractions
def expandContractions(inputList):
contractions = {
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": " what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
}
    #for every token list in the input, build a corrected copy
    newTokenList = []
    for list_of_tokens in inputList:
        expanded = []
        for word in list_of_tokens:
            # if the word is in our dictionary, replace it with the tokens of its expanded version
            # (some entries list alternatives separated by ' / '; we take the first alternative)
            if word.lower() in contractions:
                expanded.extend(contractions[word.lower()].split(" / ")[0].split())
            # if the word contains a hyphen, split it into its component words
            elif "-" in word:
                expanded.extend(word.split("-"))
            else:
                expanded.append(word)
        newTokenList.append(expanded)
    return newTokenList
def punctuation(list_of_tokenLists):
    newTokenList = [] #this will store the new list
    punct_table = str.maketrans('', '', string.punctuation) #translation table that deletes punctuation characters
    for tokens in list_of_tokenLists: #for each list in the big list
        depunctuated = [] #this will store a list of tokens from a single doc
        for word in tokens: #for each word in the list
            stripped = word.translate(punct_table) # remove punctuation characters
            if stripped: # drop tokens that were punctuation only
                depunctuated.append(stripped)
        newTokenList.append(depunctuated)
    return newTokenList
This ensures all variants of words will be counted the same, e.g "cats" and "cat" will be counted together.
def lemmatize(list_of_tokenLists):
newTokenList = [] #this will store the new list
wordnet_lemmatizer = WordNetLemmatizer()
for tokens in list_of_tokenLists:
tempLemmatized = []
for word in tokens:
tempLemmatized.append(wordnet_lemmatizer.lemmatize(word))
newTokenList.append(tempLemmatized)
return newTokenList
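As a quick sanity check on a made-up token list (not taken from the dataset), we would expect output along these lines:
print(lemmatize([['cats', 'geese', 'articles']])) #expected to print something like [['cat', 'goose', 'article']]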
This step will perform the cleaning function and show the difference between our old tokens and new tokens.
def clean(list_of_tokenLists):
list_of_tokenLists = expandContractions(list_of_tokenLists) #expand contractions
list_of_tokenLists = decapitalise(list_of_tokenLists) #decapitalise
list_of_tokenLists = remove_stopwords(list_of_tokenLists) #remove stopwords
list_of_tokenLists = punctuation(list_of_tokenLists) #remove punctuation
list_of_tokenLists = remove_numbers(list_of_tokenLists) #remove numbers
list_of_tokenLists = lemmatize(list_of_tokenLists)
return list_of_tokenLists
print("Tokens Before: ")
print(tokensList[0][:50])
print("------------")
tokensList = clean(tokensList) # clean
print("Tokens After: ")
print(tokensList[0][:50])
We see the cleaning function was successful, as both lists are different.
new_trainset['tokens'] = tokensList
new_trainset.head()
tokenContentList = []
for i in tokensList:
tokenContentList.append(" ".join(i))
new_trainset['train_clean_content'] = tokenContentList
new_trainset.head()
In this section we will investigate our cleaned dataframe and explore the data.
This section will help us understand our dataframe at a high level.
new_trainset.groupby("category").size().plot(kind='bar')
This chart shows how many articles of each category we have in our training set. It is quite a balanced dataset, but we have more sport and business examples than entertainment examples.
Here, we will see the average length of content for each category.
wordDict = {} #this will be used to plot the results later
catGroups = new_trainset.groupby('category') #group by category
groupsList = catGroups.groups.keys() #get names of categories in list form
for group in groupsList: #for each category name
df = new_trainset[new_trainset['category'] == group] #slice the dataframe to get only this category
length_group = len(df) #length of this category for calculation
total_words = 0 #reset word value
for content in df['content']: #get 1 article from this category
words = content.split() #split into individual words
total_words = total_words + len(words) #add number of words to master list
wordDict[group] = total_words/length_group #record the average length in the dict
print("Average words in " + str(group) + " articles : " + str(total_words/length_group)) #print the result
A bar chart is a clearer way to visualise the average article length for each category.
plt.bar(range(len(wordDict)), list(wordDict.values()), align='center') #make a bar chart
plt.xticks(range(len(wordDict)), list(wordDict.keys())) #assign labels
We can see from this chart that politics and tech articles tend to be the longest, so these categories will have the most content to learn from.
Here, we will analyse our newly cleaned article content.
masterWordList = []
for content in new_trainset['train_clean_content']:
masterWordList.append(content) #add each content to list
text = ' '.join(masterWordList) #join all of the corpus words
def wordcloud(words):
    wordcloud = WordCloud(width=1600, height=800).generate(words) #generate the cloud from the text passed in
plt.figure(figsize=(20,10), facecolor='k') #adjust size and color
plt.imshow(wordcloud) #show wordcloud on plot
plt.tight_layout(pad=0) #tight layout
plt.axis("off") #dont show axis
plt.show() #show visualisation
wordcloud(text)
#plt.savefig('img_wordcloud.png')
We can see in our word cloud that many different topics are represented in the corpus: terms like 'technology' (mentioned by name), 'country' (which probably relates to politics), and 'film' (which most likely corresponds to entertainment).
However, we also see that many of the most frequent words are ones likely used across all topics, such as 'said', 'people', and 'one'.
for group in groupsList: #for each category name
df = new_trainset[new_trainset['category'] == group] #slice the dataframe to get only this category
masterContentGroupList = []
for content in df['train_clean_content']: #get 1 article from this category
masterContentGroupList.append(content)
text = ' '.join(masterContentGroupList) #join all of the group words
print(str(group) + ":")
wordcloud(text)
When we split the wordclouds by document topic, a clear picture is painted of each topic.
In the initial wordcloud of the whole corpus, many of the most frequent words did not seem topic-specific. However, for the individual classes many of the top words are much more domain-specific (although 'said' seems to be a frequent word in every topic). It would be a very simple task for a person to identify which topics these wordclouds belong to without prior knowledge of our data.
text = ' '.join(masterWordList)
words = text.split()
word_frequency = nltk.FreqDist(words)
word_frequency.plot(10, title = 'Top 10 Most Common Words')
Looking at the 10 most common words in our corpus, it wouldn't be immediately obvious what sort of topics it might contain. This reflects what we saw in our previous wordclouds: the top words in the corpus tend to be those that are common to all articles and are not topic-specific.
for group in groupsList: #for each category name
df = new_trainset[new_trainset['category'] == group] #slice the dataframe to get only this category
masterContentGroupList = []
for content in df['train_clean_content']: #get 1 article from this category
masterContentGroupList.append(content)
text = ' '.join(masterContentGroupList) #join all of the group words
words = text.split()
word_frequency = nltk.FreqDist(words)
title = 'Top 10 Most Common Words: ' + group
word_frequency.plot(10, title = title)
When we consider the 10 most common words by category, a different picture is revealed.
While we still see some of the words common across all topics, the top 10 words easily provide enough information to be classified by humans. There are clearly distinct, topic specific words occurring frequently for each class.
text = ' '.join(masterWordList)
words = text.split()
#bigrams
n = 2
bigrams = list(ngrams(words, n))
bigram_frequency = nltk.FreqDist(bigrams)
print("5-top most common bigram:")
for i in list(bigram_frequency.most_common(5)):
print(i)
print("\n-------------\n")
#trigrams
n = 3
trigrams = list(ngrams(words, n))
trigram_frequency = nltk.FreqDist(trigrams)
print("5-top most common trigram:")
for i in list(trigram_frequency.most_common(5)):
print(i)
Reviewing the most common bi-grams and tri-grams, we once again see some common, non-topic-specific phrases such as 'last year', 'I think', and 'told BBC'.
However, we do see 'Leader Michael Howard' and 'Mr. Kilroy Silk' appear several times, both references to British politicians. So it appears that politics may be disproportionately represented among the most common bi-grams and tri-grams.
Below, we find all the noun tokens in the corpus and add them to the dataframe.
nounList = []
for i in range(len(new_trainset)):
tokens = new_trainset.iloc[i]['tokens']
is_noun = lambda pos: pos[:2] == 'NN'
nouns = [word for (word, pos) in nltk.pos_tag(tokens) if is_noun(pos)]
nounList.append(nouns)
new_trainset['noun_tokens'] = nounList
Now, we investigate the most common nouns in the corpus.
masterNouns = []
for noun_list in new_trainset['noun_tokens']:
for noun in noun_list:
masterNouns.append(noun)
word_frequency = nltk.FreqDist(masterNouns)
word_frequency.plot(10, title = 'Top 10 Most Common Nouns')
Viewing the top 10 nouns in our corpus paints a clearer picture of the topics that may be present than the top 10 words did.
We see words such as 'game', 'government', 'film', 'company', which give some insight into what may be discussed in the articles.
for group in groupsList: #for each category name
df = new_trainset[new_trainset['category'] == group] #slice the dataframe to get only this category
masterNounGroupList = []
for content in df['noun_tokens']: #get 1 article from this category
for noun in content:
masterNounGroupList.append(noun)
word_frequency = nltk.FreqDist(masterNounGroupList)
title = 'Top 10 Most Common Nouns: ' + group
word_frequency.plot(10, title = title)
Once again, as we split the noun counts by topic, a distinct picture forms of each one. The top 10 nouns for each topic are more distinct from one another than the top 10 words were.
As a final processing step after analysis, we will convert categories to numerical values for classification.
#print old headings for comparison
print(new_trainset['category'].head())
print("----------------------------")
#replace with new labels
labels = {"business": 0, "entertainment": 1, "politics": 2, "sport": 3, "tech": 4} #corresponding new values
new_trainset['category'] = new_trainset['category'].replace(labels) #map each category name to its numeric label
#new labels
print(new_trainset['category'].head())
This section takes our newly constructed dataframe and converts the text into numerical vectors for further analysis.
Our team considered several approaches to numerical vectorisation. We researched each one and explain the methods here; each subsection explores a different method.
Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell contains the frequency count of that term in that document.
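A minimal sketch on two made-up documents (not from our corpus) illustrates this layout:
toy_docs = ["the cat sat on the mat", "the dog chased the cat"]
toy_vectorizer = CountVectorizer()
toy_matrix = toy_vectorizer.fit_transform(toy_docs)
print(toy_vectorizer.get_feature_names()) #columns: one per unique term
print(toy_matrix.toarray()) #rows: one per document; cells: term counts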
Firstly, we'll take the noun tokens column which we extracted from our cleaned data.
We also experimented with using all cleaned tokens (not just nouns), but performance was ultimately better with the noun tokens, so we kept only the 'noun_tokens' code to avoid making the notebook overly verbose and difficult to evaluate.
documents = new_trainset['noun_tokens'].apply(lambda x: (' ').join(x))
print(documents.head())
Next, we use scikit-learn's 'CountVectorizer' to transform 'documents' into a document-term matrix. Our input, 'documents', is a sequence of strings where each string is a separate document.
Our output, A, is a sparse matrix (SciPy sparse format) with rows corresponding to documents and columns corresponding to terms.
count_vectorizer = CountVectorizer()
A = count_vectorizer.fit_transform(documents) # num_occurrences of a word in each document
print(A)
print('Size of train data (count): ' + str(A.shape))
terms = count_vectorizer.get_feature_names() # list of all terms
len(terms)
Above, we see the list of unique terms and the total number of them; each term corresponds to one column of the document-term matrix.
Below, we convert our vector to a dataframe.
count_vect_df = pd.DataFrame(A.todense(), columns=terms)
count_vect_df.head()
We initially passed ngram_range=(1,2) to the TfidfVectorizer constructor to investigate how using n-grams would influence our accuracy. It increased performance for Naive Bayes by about 0.007%. However, the running time was greatly affected when we were performing hyperparameter optimisation.
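For reference, here is a small sketch (on a made-up sentence, not our data) of what ngram_range=(1,2) adds to the vocabulary:
demo_tfidf = TfidfVectorizer(ngram_range=(1, 2))
demo_tfidf.fit(["the prime minister spoke today"])
print(demo_tfidf.get_feature_names()) #unigrams plus bigrams such as 'prime minister' and 'minister spoke'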
tfidf_vectorizer = TfidfVectorizer() # declare vectorizer object (stopwords were already removed during cleaning)
# Create a sparse matrix where the entries are TF-IDF weights
tfidf = tfidf_vectorizer.fit_transform(documents)
print(tfidf)
# To print the TF-IDF matrix, we convert it into a dataframe
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names())
df_tfidf.head(10)
print('Size of train data (tfidf): ' + str(tfidf.shape))
Above, we produced both a count-vector and a TF-IDF representation of the corpus. Next, we apply topic modelling, starting with Latent Dirichlet Allocation (LDA) on a count vectorisation restricted to mid-frequency terms (min_df=20, max_df=1000).
lda_count_vectorizer = CountVectorizer(min_df=20, max_df=1000) # separate vectorizer, so the original count_vectorizer is preserved for transforming the test set later
A_lda = lda_count_vectorizer.fit_transform(documents)
lda_terms = lda_count_vectorizer.get_feature_names()
num_topics = 5
lda_model = LatentDirichletAllocation(n_components=num_topics)
W_lda = lda_model.fit_transform(A_lda)
H_lda = lda_model.components_
# To display the document topics
top_terms = 10
for topic_index, topic in enumerate(H_lda):
    print("Topic %d:" % (topic_index))
    print(" ".join([lda_terms[i]
        for i in topic.argsort()[:-top_terms - 1:-1]]))
Here are the top 10 words for each topic using LDA on count vectorization
num_topics = 5
nmf_model = NMF(n_components=num_topics, init="nndsvd") # init="nndsvd" gives a deterministic, SVD-based initialisation
W_nmf = nmf_model.fit_transform( tfidf ) # W = matrix that contains the topics discovered from the documents
H_nmf = nmf_model.components_ # H = coefficient matrix containing the membership weights for the topics in each document
# To display the document topics
top_terms = 10
for topic_index, topic in enumerate(H_nmf):
print("Topic %d:" % (topic_index))
print(" ".join([tfidf_vectorizer.get_feature_names()[i]
for i in topic.argsort()[:-top_terms - 1:-1]]))
From this we can see that NMF gives more coherent topics, so we will convert its document-topic matrix (W_nmf) to a dataframe.
nmf_df = pd.DataFrame(W_nmf)
This is where we train classifiers using features we have created.
First, this is the k fold function we are going to use to analyse our classifiers.
The function performs 10-fold cross-validation; we chose k = 10, which we felt was an adequate number of splits while still falling within our time constraints. It reports the accuracy and macro F1 score of the classifier, which we felt were suitable measures of performance because our training data is reasonably balanced, and it also plots a confusion matrix and ROC curves.
def kFold(dataframe,classifier):
    scoring = ['f1_macro'] # metric(s) to compute during cross-validation
    scores = cross_validate(classifier, dataframe, new_trainset.category, scoring=scoring, cv=10)
f1_score = np.mean(scores['test_f1_macro'])
fit_time = np.mean(scores['fit_time'])
score_time = np.mean(scores['score_time'])
y_pred = cross_val_predict(classifier, dataframe, new_trainset.category, cv=10)
y_pred_proba = cross_val_predict(classifier, dataframe, new_trainset.category, cv=10, method = 'predict_proba')
cm = confusion_matrix(new_trainset.category, y_pred)
print('Accuracy:\t', accuracy_score(new_trainset.category, y_pred))
print("F1 score:\t", f1_score)
print("Fit Time:\t", fit_time)
print("Score Time:\t", score_time)
plot_confusion_matrix(cm)
plot_roc_curve(new_trainset.category, y_pred_proba)
Next we want to define a function to plot the confusion matrix.
def plot_confusion_matrix(cm, target_names=['Business', 'Entertainment', 'Politics', 'Sport', 'Tech'],
title='Confusion Matrix', cmap=plt.cm.Blues):
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=90)
plt.yticks(tick_marks, target_names)
plt.tight_layout()
width, height = cm.shape
for x in range(width):
for y in range(height):
plt.annotate(str(cm[x][y]), xy=(y, x),
horizontalalignment='center',
verticalalignment='center')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
Finally, we define a function to output the ROC Curve.
A ROC (Receiver Operating Characteristic) Curve is used to show the performance of a classification model at all classification thresholds.
def plot_roc_curve(actual_labels, predicted_probas):
skplt.metrics.plot_roc(actual_labels, predicted_probas, plot_micro=False, plot_macro=False,
title='ROC Curve', figsize=(5,5))
plt.show()
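As an aside, the same per-class curves can be computed directly with scikit-learn's roc_curve, using the label_binarize import above. A minimal sketch, with illustrative names that are not part of the pipeline:
def roc_points_per_class(actual_labels, predicted_probas, n_classes=5):
    #one-vs-rest: binarise the true labels, then compute an ROC curve per class
    binarized = label_binarize(actual_labels, classes=list(range(n_classes)))
    curves = {}
    for c in range(n_classes):
        fpr, tpr, thresholds = metrics.roc_curve(binarized[:, c], predicted_probas[:, c])
        curves[c] = (fpr, tpr)
    return curves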
cm_bayes_tfidf = kFold(df_tfidf, MultinomialNB())
cm_bayes_word_vectors = kFold(count_vect_df, MultinomialNB())
cm_bayes_nmf = kFold(nmf_df, MultinomialNB())
print('The search range for alpha is: ' + str(np.linspace(0.01,1,11)))
classifier = MultinomialNB()
parameters = {'alpha': (np.linspace(0.01,1,11))}
gridSearchNB = GridSearchCV(classifier, parameters, cv=10,scoring='accuracy')
gridSearchNB2 = GridSearchCV(classifier, parameters, cv=10,scoring='accuracy')
gridSearchNB3 = GridSearchCV(classifier, parameters, cv=10,scoring='accuracy')
gridSearchNB_tfidf = gridSearchNB.fit(df_tfidf, new_trainset['category'])
gridSearchNB_wv = gridSearchNB2.fit(count_vect_df, new_trainset['category'])
gridSearchNB_nmf = gridSearchNB3.fit(nmf_df, new_trainset['category'])
NB_tfidf_parameters = gridSearchNB_tfidf.best_params_
NB_wv_parameters = gridSearchNB_wv.best_params_
NB_nmf_parameters = gridSearchNB_nmf.best_params_
print('The best tfidf parameter is: ', NB_tfidf_parameters['alpha'])
print('The best word vector parameter is: ', NB_wv_parameters['alpha'])
print('The best NMF parameter is: ', NB_nmf_parameters['alpha'])
cm_bayes_tfidf_ho = kFold(df_tfidf, MultinomialNB(alpha=NB_tfidf_parameters['alpha']))
cm_bayes_word_vectors_ho = kFold(count_vect_df, MultinomialNB(alpha=NB_wv_parameters['alpha']))
cm_bayes_nmf_ho = kFold(nmf_df, MultinomialNB(alpha=NB_nmf_parameters['alpha']))
Result: Naive Bayes with Word Vectorization and hyperparameter optimisation has the best accuracy.
We have set k = 3 for now. Hyperparameter Optimisation will be done with k set to 1, 3 and 5.
cm_knn_tfidf = kFold(df_tfidf, KNeighborsClassifier(n_neighbors=3))
cm_knn_word_vectors = kFold(count_vect_df, KNeighborsClassifier(n_neighbors=3))
cm_knn_nmf = kFold(nmf_df, KNeighborsClassifier(n_neighbors=3))
This code takes up to 20 minutes to run, despite cross-validation being set to only 2 folds and only 3 values of k being tested. The code has been commented out, but the results it produces are provided below.
# k_range = [1,3,5]
# param_grid = dict(n_neighbors=k_range)
# knn = KNeighborsClassifier()
# grid_tfidf = GridSearchCV(knn, param_grid, cv=2, scoring='accuracy')
# grid_wv = GridSearchCV(knn, param_grid, cv=2, scoring='accuracy')
# grid_nmf = GridSearchCV(knn, param_grid, cv=2, scoring='accuracy')
# grid_tfidf.fit(df_tfidf, new_trainset['category'])
# grid_wv.fit(count_vect_df, new_trainset['category'])
# grid_nmf.fit(nmf_df, new_trainset['category'])
# grid_tfidf_mean_scores = list(grid_tfidf.cv_results_['mean_test_score'])
# grid_wv_mean_scores = list(grid_wv.cv_results_['mean_test_score'])
# grid_nmf_mean_scores = list(grid_nmf.cv_results_['mean_test_score'])
The code below accounts for the fact that the previous cell has been commented out; it is not needed otherwise. These are the scores produced by the hyperparameter optimisation.
grid_tfidf_mean_scores = [0.8887417218543047, 0.9119205298013245, 0.9264900662251656]
grid_wv_mean_scores = [0.7026490066225165, 0.6079470198675496, 0.5556291390728477]
grid_nmf_mean_scores = [0.9291390728476822, 0.93841059602649, 0.9417218543046357]
print(max(grid_tfidf_mean_scores))
print(max(grid_wv_mean_scores))
print(max(grid_nmf_mean_scores))
As we can see, none of the best KNN scores for TF-IDF, word vectors, or NMF out-performs the most successful combination so far: Naive Bayes with word (count) vectors.
We therefore chose our optimised Naive Bayes with count vectors as the final model, as it performed the best.
Below we load our test data and ensure it is in the expected format.
raw_testset = pd.read_csv('testdata.csv',sep='^',header=0)
raw_testset.head()
Now, we clean the test data in the same way we cleaned the training data.
#make a new test set
new_testset = raw_testset.copy()
#### Tokenization
test_tokensList = [] # this is a list of lists. Each list contains the tokens of a document.
for content in new_testset['content']:
token_words = tokenizer.tokenize(content) #tokenize all words in the document
test_tokensList.append(token_words) #add this list to tokensList
#### Cleaning tokens
test_tokensList = clean(test_tokensList) # apply all cleaning
#### Add tokens to dataframe
new_testset['tokens'] = test_tokensList
#### Combine tokens into a string
clean_content_test_tokens = []
for listOfTestTokens in new_testset['tokens']:
clean_content_test_tokens.append(' '.join(listOfTestTokens))
#### Add cleaned content to dataframe
new_testset['test_clean_content'] = clean_content_test_tokens
new_testset.head()
# Re-use the CountVectorizer fitted on the training data, so the test features line up with the training columns
test_word_vec = count_vectorizer.transform(new_testset['test_clean_content'])
print(test_word_vec)
print(test_word_vec.shape, count_vect_df.shape)
modelbayes = MultinomialNB(alpha=NB_wv_parameters['alpha'])
modelbayes.fit(count_vect_df,new_trainset['category']) #fitted to trainset and its categories
predicted_labels_bayes = modelbayes.predict(test_word_vec)
final_results = pd.DataFrame() #empty dataframe to combine content and category
final_results_content = [] #empty list to store content
final_results_categories = [] #empty list to store category
_labels = ['business', 'entertainment', 'politics', 'sport', 'tech'] #labels for clarity
# assemble the lists for all the original raw dataset
for i in range(len(raw_testset)):
final_results_content.append(raw_testset['content'][i])
    final_results_categories.append(_labels[int(predicted_labels_bayes[i])]) # map the numeric prediction back to its label
# add lists to final dataframe
final_results['content'] = final_results_content
final_results['predicted_category'] = final_results_categories
#print results
final_results
final_results.to_csv("final_results_CessnaSkyhawk.csv", index=False)
test_results_csv = pd.read_csv("final_results_CessnaSkyhawk.csv")
test_results_csv.head()