First, we open up news.json (continuing from News Article Analysis 1.0).
Recall that the all_articles structure looks like this:
all_articles = [[date, title, content, link], [date, title, content, link], ...]
# Open the JSON file
import json
with open('news.json') as f:
    all_articles = json.load(f)
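To confirm the structure after loading, we can peek at the first entry (a minimal sanity check, assuming the [date, title, content, link] layout shown above):
print(len(all_articles))    # number of articles loaded
print(all_articles[0][0])   # date of the first article
print(all_articles[0][1])   # title of the first article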
Regular expressions (re) - used for searching and matching.
Counter (from collections) - used to count word frequencies.
stopwords - used to remove stopwords from the article content.
text - stores all article content as one single string.
tokens - the tokenized words (excluding stopwords and numbers).
len(tokens) - the total number of words.
Counter(tokens).most_common(100) - the 100 most common words.
import re
from collections import Counter

stopwords = open('stopwords.txt', 'r').read().splitlines()

text = ''
for i in range(len(all_articles)):
    try:
        text += all_articles[i][2]  # concatenate all article content into a single string
    except TypeError:
        continue

tokens = re.findall(r'\w+', text)  # re.findall returns a list of matched word strings
tokens = [t.lower() for t in tokens if t.lower() not in stopwords]  # remove stopwords

# Remove all numbers (e.g. 2018, 1, 10, 100)
tokens = [t for t in tokens if not t.isdigit()]

print('\n' + 'Total number of words = ' + str(len(tokens)) + '\n')
print(Counter(tokens).most_common(100))
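To see what the tokenization step actually does, here is a minimal sketch on a made-up sentence (the sample text and the tiny stopword set are illustrative assumptions, not the real stopwords.txt):
sample = 'In 2018 the market grew, and the market grew again in 2019.'
sample_stopwords = {'in', 'the', 'and'}  # assumed subset, for illustration only
tokens_demo = re.findall(r'\w+', sample)
tokens_demo = [t.lower() for t in tokens_demo
               if t.lower() not in sample_stopwords and not t.isdigit()]
print(tokens_demo)                          # ['market', 'grew', 'market', 'grew', 'again']
print(Counter(tokens_demo).most_common(2))  # [('market', 2), ('grew', 2)]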
filter_word - stores the words you want to remove, filled in manually.
Count the total number of words again, and show the 100 most common.
# Exclude some words you find not meaningful
print('Key in the words you wish to exclude from the list')
filter_word = input()
filter_word = filter_word.split()
for word in filter_word:
    while word in tokens:
        tokens.remove(word)

print('\n' + 'Total number of words = ' + str(len(tokens)) + '\n')
print(Counter(tokens).most_common(100))
Referring to the result above, exclude more non-meaningful words.
# Referring to the result above, exclude more words you find not meaningful
print('Key in the words you wish to exclude from the list')
filter_word = input()
filter_word = filter_word.split()
for word in filter_word:
    while word in tokens:
        tokens.remove(word)

print('\n' + 'Total number of words = ' + str(len(tokens)) + '\n')
print(Counter(tokens).most_common(100))
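Since this filtering step repeats, you could wrap it in a small helper and call it for each round (a sketch; exclude_words is a hypothetical name, not part of the original code):
def exclude_words(tokens, words):
    """Return tokens with every word in `words` removed."""
    words = set(words)
    return [t for t in tokens if t not in words]

# Example of one filtering round:
# tokens = exclude_words(tokens, input('Key in the words to exclude: ').split())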
Use WordCloud and Matplotlib to visualize the frequency of words.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

all_words = ' '.join(tokens)
wordcloud = WordCloud(background_color="white", width=1600, height=800).generate(all_words)

# Plot the generated image
plt.figure(figsize=(16, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
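If you also want to keep the image as a file, WordCloud can write it out directly (the filename here is just an example):
wordcloud.to_file('wordcloud.png')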