Monday, 13 January 2014

Word frequencies

##################################################
#  Looking briefly at word frequencies
#
#
#  JHPS
#  C:  13/01/2014
#
##################################################

import os,string
from collections import Counter
from pylab import *

#### Overall word usage
#
# Reads every '.txt' file in the current directory, normalises the text to
# lower-case words with punctuation removed, then prints the total number of
# words followed by the frequency of each word.

## Read the posts into a dict

# Maps file name -> list of that post's lines.  Each file is read eagerly and
# closed straight away, so no handles are left open and the lines can be
# iterated more than once.
local_files = os.listdir('.')
posts = {}

for file_name in local_files:
    # Match on the extension only: a substring test would also pick up names
    # such as 'notes.txt.bak'.  Directories are skipped so open() cannot fail
    # on one that happens to end in '.txt'.
    if file_name.endswith('.txt') and os.path.isfile(file_name):
        with open(file_name, 'r') as handle:
            posts[file_name] = handle.readlines()

## Pull all the words out

exclude = set(string.punctuation)
words = []

for post_key in posts:
    for line in posts[post_key]:
        # split() with no argument splits on any run of whitespace (spaces,
        # tabs, the trailing newline), so blank lines and repeated spaces do
        # not produce empty tokens.
        for token in line.lower().split():
            word = ''.join(ch for ch in token if ch not in exclude)
            # A token made only of punctuation (e.g. '--') is empty once it
            # has been stripped; it is not a word, so leave it out.
            if word:
                words.append(word)

## Total word count is:
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(len(words))

## Counter word frequencies
print(Counter(words))

###  Looking at the top keywords over time
#
# For every post, counts how often each chosen keyword occurs and appends
# that count to the keyword's list, giving one entry per post in each list.

## Read the posts into a dict

# Maps file name -> list of that post's lines.  The files are read again here
# so this section works no matter what earlier code did with `posts`; each
# file is closed as soon as it has been read.
local_files = os.listdir('.')
posts = {}

for file_name in local_files:
    # Extension match only (not a substring test), and skip directories.
    if file_name.endswith('.txt') and os.path.isfile(file_name):
        with open(file_name, 'r') as handle:
            posts[file_name] = handle.readlines()

## key words
data = []
technology = []
information = []
environment = []
system = []
internet = []
example = []
things = []
post_cnt = []
time = []
computational = []

# Each per-post series paired with the spellings that count towards it.
keyword_series = [
    (data, ('data',)),
    (technology, ('technology',)),
    (information, ('information',)),
    (environment, ('environment', 'environmental')),
    (system, ('system', 'systems')),
    (internet, ('internet',)),
    (example, ('example',)),
    (things, ('things',)),
    (post_cnt, ('post',)),
    (time, ('time',)),
    (computational, ('computational',)),
]

# Punctuation characters removed from every token before counting; built
# here so this section does not rely on a name defined earlier in the file.
exclude = set(string.punctuation)

# Visit the posts in sorted file-name order so the series are filled in a
# reproducible order; plain dict iteration order is arbitrary on Python 2.
# NOTE(review): this is chronological only if the file names sort
# chronologically -- confirm against the naming scheme.
for post_key in sorted(posts):
    post_words = []

    for line in posts[post_key]:
        # split() with no argument splits on any run of whitespace, so words
        # separated by tabs or repeated spaces are still told apart.
        for token in line.lower().split():
            post_words.append(''.join(ch for ch in token if ch not in exclude))

    counted = Counter(post_words)

    for series, spellings in keyword_series:
        # Counter returns 0 for a word that never occurs.  A plain loop is
        # used rather than sum() because `from pylab import *` at the top of
        # the file may rebind that builtin.
        occurrences = 0
        for spelling in spellings:
            occurrences += counted[spelling]
        series.append(occurrences)

## Plotting these
#
# Draws two side-by-side panels that share a y-axis, each showing the
# per-post frequency of a group of keywords, then displays the figure.

fig = plt.figure(figsize=(15, 7))

# One tick per post, labelled 1..N.  The number of posts is taken from
# `data`, which gets one entry appended per post, instead of assuming there
# are exactly 13 posts.
n_posts = len(data)
tick_positions = list(range(n_posts))
tick_labels = list(range(1, n_posts + 1))

# Left panel.  plot()/xticks()/xlabel()/ylabel() act on the current axes,
# which is the subplot that was added most recently.
ax1 = fig.add_subplot(1,2,1)
for series, label in [
    (data, 'Data'),
    (technology, 'Technology'),
    (information, 'Information'),
    (environment, 'Environment/Environmental'),
    (system, 'System'),
    (internet, 'Internet'),
]:
    plot(series, label=label)

xticks(tick_positions, tick_labels)
xlabel('Posts')
ylabel('Word frequency')

# The legend anchor is given in figure coordinates (transFigure), not in
# the coordinates of the axes.
ax1.legend(bbox_to_anchor=(0.4, 0, 0, 0), bbox_transform=gcf().transFigure)

# Right panel, sharing its y-axis with the left one.
ax2 = fig.add_subplot(1,2,2,sharey=ax1)

for series, label in [
    (example, 'Example'),
    (things, 'Things'),
    (post_cnt, 'Post'),
    (time, 'Time'),
    (computational, 'Computational'),
]:
    plot(series, label=label)

xticks(tick_positions, tick_labels)
xlabel('Posts')

ax2.legend(bbox_to_anchor=(0.8, 0, 0, 0), bbox_transform=gcf().transFigure)

show()