processwiki.py
# functions to convert wikipedia articles saved as
# (article_as_string, title)
# to
# (title, ({wordid : trainwordct}, {wordid: testwordct}))
# and save them back to disk
from __future__ import division
from collections import Counter, deque
from itertools import islice
from wikirandom import get_random_wikipedia_article
import cPickle as pickle
import re
import copy

def online_wiki_docs():
    # This is slow, for illustration only
    while True:
        yield get_random_wikipedia_article()

def load_list(pkl_file, num_items=3300000):
    with open(pkl_file, 'rb') as f:
        while num_items > 0:
            try:
                yield pickle.load(f)
                num_items -= 1
            except EOFError:
                # end of file: just stop the generator
                return
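# Illustrative usage ('parsed_wiki.pkl' is a hypothetical file name, assumed
# to have been written by save_data below):
#   docs = load_list('parsed_wiki.pkl', num_items=1000)
#   first_batch = list(islice(docs, 64))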

def clean_doc(doc, vocab):
    """Given a doc string and a user-defined vocab, return a list such as
    ['you', 'me'] containing only the words that appear in vocab."""
    doc = doc.lower()
    doc = re.sub(r'-', ' ', doc)
    doc = re.sub(r'[^a-z ]', '', doc)
    doc = re.sub(r' +', ' ', doc)
    return [word for word in doc.split() if word in vocab]
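# Example (illustrative): with vocab = {'you': 0, 'me': 1},
#   clean_doc('You -- and ME!', vocab)  ->  ['you', 'me']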

def parse_docs(name_docs, vocab):
    """Generate tuples like ('my novel', Counter({...}), Counter({...}))."""
    for doc, name in name_docs:
        words = clean_doc(doc, vocab)
        # Hold back every 10th word to get an online estimate of perplexity;
        # both lists hold the words' integer ids from vocab
        train_words = [vocab[w] for (i, w) in enumerate(words) if i % 10 != 0]
        test_words = [vocab[w] for (i, w) in enumerate(words) if i % 10 == 0]
        # for a list ['1', '1', '2', '3', '4'], Counter returns the dict
        # {'1': 2, '2': 1, '3': 1, '4': 1}; here the keys are word ids,
        # so the dicts look like {3846: 1, 2393: 1, 8839: 2}
        train_cntr = Counter(train_words)
        test_cntr = Counter(test_words)
        yield (name, train_cntr, test_cntr)
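# Example (illustrative): with vocab = {'you': 0, 'me': 1},
#   list(parse_docs([('you me you', 'my novel')], vocab))
#   ->  [('my novel', Counter({0: 1, 1: 1}), Counter({0: 1}))]
# (the word at index 0 is held out as a test word)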

def take_every(n, iterable):
    """Yield successive lists of n tuples like [('', {}, {}), ...];
    each yielded list is one mini-batch (the last may be shorter)."""
    i = iter(iterable)
    piece = list(islice(i, n))
    while piece:
        yield piece
        piece = list(islice(i, n))
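# Example (illustrative):
#   list(take_every(2, [1, 2, 3, 4, 5]))  ->  [[1, 2], [3, 4], [5]]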

def group_up(n, N, iterable):
    """Like take_every, but stop after about N items in total."""
    i = iter(iterable)
    ctr = 0
    while ctr < N:
        piece = list(islice(i, n))
        ctr += n
        yield piece
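# Example (illustrative):
#   list(group_up(2, 5, range(10)))  ->  [[0, 1], [2, 3], [4, 5]]
# (ctr counts requested items, so the total may overshoot N by up to n - 1)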

def consume(iterator, n):
    """Advance the iterator n steps ahead. If n is None, consume entirely."""
    # Use functions that consume iterators at C speed.
    if n is None:
        # feed the entire iterator into a zero-length deque
        deque(iterator, maxlen=0)
    else:
        # advance to the empty slice starting at position n
        next(islice(iterator, n, n), None)
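# Example (illustrative): skip the first 100 items of a stream:
#   docs = load_list('parsed_wiki.pkl')   # hypothetical file name
#   consume(docs, 100)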

def save_data(source, out_file):
    with open(out_file, 'wb') as out:
        for item in source:
            # protocol=-1 selects the highest pickle protocol available
            pickle.dump(item, out, protocol=-1)
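# Illustrative pipeline (file name is hypothetical); online_wiki_docs is an
# endless stream, so bound it with islice before saving:
#   save_data(islice(parse_docs(online_wiki_docs(), vocab), 1000),
#             'parsed_wiki.pkl')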

def get_test_set(docs):
    """Input:  [('novel', {1234: 1, 4567: 1}), ...]
    Output: [('novel', {...}, {...}), ...], i.e. move every 10th word
    (counting word keys across all documents) into that document's
    held-out test dict."""
    cnt = 0
    out_docs = []
    for name, doc in docs:
        test = {}
        doc_c = copy.deepcopy(doc)
        for word in doc:
            if cnt % 10 == 0:
                test[word] = doc[word]
                del doc_c[word]
            cnt += 1
        out_docs += [(name, doc_c, test)]
    return out_docs
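# Example (illustrative; dict iteration order decides which words move):
#   get_test_set([('novel', {1234: 1, 4567: 1})])
#   might return [('novel', {4567: 1}, {1234: 1})]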

def create_vocab(vocab_file):
    dictnostops = open(vocab_file).readlines()
    vocab = dict()
    # dictnostops is a list like ['you', 'me'];
    # enumerate yields tuples like (0, 'you'), (1, 'me'), so
    # list(enumerate(dictnostops)) gives [(0, 'you'), (1, 'me')]
    for idx, word in enumerate(dictnostops):
        word = word.lower()
        word = re.sub(r'[^a-z]', '', word)
        vocab[word] = idx
    return vocab
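# Example (illustrative): if vocab_file holds the lines 'You\n' and 'me\n',
#   create_vocab(vocab_file)  ->  {'you': 0, 'me': 1}
# (the regex strips the trailing newline along with any non a-z characters)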