##################################################
# Looking briefly at word frequencies
#
#
# JHPS
# C: 13/01/2014
#
##################################################
import os,string
from collections import Counter
from pylab import *
#### Overall word usage
## Characters removed from every token before it is counted
exclude = set(string.punctuation)


def read_post_words(path):
    """Return the lower-cased, punctuation-free words of one text file.

    Each line is lower-cased and split on runs of whitespace, then every
    character found in the module-level ``exclude`` set is removed from each
    token.  Tokens that end up empty (doubled spaces, punctuation-only tokens
    such as ``--``) are dropped so they are not counted as words.

    path -- name of the text file to read.

    Returns a list of words in the order they appear in the file.
    """
    found = []
    # `with` closes the handle as soon as the post has been read; nothing is
    # left open for the rest of the script.
    with open(path, 'r') as handle:
        for line in handle:
            for token in line.lower().split():
                word = ''.join(ch for ch in token if ch not in exclude)
                if word:
                    found.append(word)
    return found


## Read every post in the current directory once
# Sorted so that posts are always visited in the same order; os.listdir()
# and dict iteration give no ordering guarantee.
# NOTE(review): this assumes the file names sort chronologically - confirm,
# since the per-post series below are plotted as "over time".
local_files = sorted(os.listdir('.'))
posts = {}
for file_name in local_files:
    # endswith() rather than a substring test, so e.g. 'a.txt.bak' is skipped
    if file_name.endswith('.txt'):
        # posts maps file name -> list of cleaned words (not an open file)
        posts[file_name] = read_post_words(file_name)
post_order = sorted(posts)
## pull all the words together
words = []
for post_key in post_order:
    words += posts[post_key]
## Total word count is:
print(len(words))
## Counter word frequencies
print(Counter(words))
### Looking at the top keywords over time
## key words: one list per keyword, one entry per post (in post_order)
data = []
technology = []
information = []
environment = []
system = []
internet = []
example = []
things = []
post_cnt = []
time = []
computational = []
for post_key in post_order:
    # Counter returns 0 for words that never occur in this post
    counted = Counter(posts[post_key])
    data.append(counted['data'])
    technology.append(counted['technology'])
    information.append(counted['information'])
    # related word forms are merged into a single series
    environment.append(counted['environment'] + counted['environmental'])
    system.append(counted['system'] + counted['systems'])
    internet.append(counted['internet'])
    example.append(counted['example'])
    things.append(counted['things'])
    post_cnt.append(counted['post'])
    time.append(counted['time'])
    computational.append(counted['computational'])
## Plotting these
# One tick per post, labelled from 1, for however many posts were counted
# (every keyword list has one entry per post, so `data` gives the length).
tick_positions = list(range(len(data)))
tick_labels = [position + 1 for position in tick_positions]
fig = plt.figure(figsize=(15, 7))
# Left panel: first group of keywords
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(data, label='Data')
ax1.plot(technology, label='Technology')
ax1.plot(information, label='Information')
ax1.plot(environment, label='Environment/Environmental')
ax1.plot(system, label='System')
ax1.plot(internet, label='Internet')
ax1.set_xticks(tick_positions)
ax1.set_xticklabels(tick_labels)
ax1.set_xlabel('Posts')
ax1.set_ylabel('Word frequency')
# bbox_transform makes the anchor point a position in figure coordinates
ax1.legend(bbox_to_anchor=(0.4, 0, 0, 0), bbox_transform=fig.transFigure)
# Right panel: second group, sharing the y axis with the left panel
ax2 = fig.add_subplot(1, 2, 2, sharey=ax1)
ax2.plot(example, label='Example')
ax2.plot(things, label='Things')
ax2.plot(post_cnt, label='Post')
ax2.plot(time, label='Time')
ax2.plot(computational, label='Computational')
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(tick_labels)
ax2.set_xlabel('Posts')
ax2.legend(bbox_to_anchor=(0.8, 0, 0, 0), bbox_transform=fig.transFigure)
show()
# No comments:
# Post a Comment