-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_obama_vec.py
executable file
·70 lines (52 loc) · 1.89 KB
/
make_obama_vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
import re
import glob
import logging
import nltk
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, word2vec
def remove_specialchars(text):
    """Replace double hyphens and ellipses with a single space.

    Each non-overlapping "--" and each run of three dots is swapped for one
    space; the double hyphens are handled first, then the ellipses.
    """
    for pattern in ("--", r"\.\.\."):
        text = re.sub(pattern, " ", text)
    return text
def remove_xa0(text):
    """Swap every non-breaking space (U+00A0) for a regular space."""
    pieces = text.split("\xa0")
    return " ".join(pieces)
def sentence_to_wordlist(sentence, remove_stopwords=False):
    """Convert a sentence into a list of lowercase words.

    Args:
        sentence: Raw sentence text.
        remove_stopwords: When true, drop English stop words (taken from
            NLTK's ``stopwords`` corpus) from the result. Defaults to False,
            in which case every token is kept.

    Returns:
        The whitespace-separated tokens of ``sentence``, lowercased, with
        every character that is neither a word character nor whitespace
        removed beforehand.
    """
    sentence_text = re.sub(r'[^\w\s]', '', sentence)
    words = sentence_text.lower().split()
    if remove_stopwords:
        # Tokens have already lost their punctuation ("don't" -> "dont"),
        # so strip the stop list the same way before comparing.
        stops = {re.sub(r'[^\w\s]', '', w) for w in stopwords.words("english")}
        words = [w for w in words if w not in stops]
    return words
def speech_to_sentences(speech, tokenizer):
    """Split a speech into sentences, each represented as a list of words.

    ``tokenizer.tokenize(speech)`` supplies the raw sentences; empty ones
    are skipped and the rest are passed through ``sentence_to_wordlist``.
    """
    return [
        sentence_to_wordlist(chunk)
        for chunk in tokenizer.tokenize(speech)
        if len(chunk) > 0
    ]
# Train a Word2Vec model on the text files under processed/ and save it.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# NOTE(review): `stemmer` and `sw` are not used anywhere in the visible code;
# kept in case something outside this view relies on them.
stemmer = LancasterStemmer()
sentences = []
sw = stopwords.words("english")
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

# Word2Vec hyperparameters (keyword each one is passed as, in backticks).
num_features = 100      # `size`
min_word_count = 30     # `min_count`
num_workers = 4         # `workers`
context = 4             # `window`
downsampling = 1e-3     # `sample`

# sorted() keeps the corpus order independent of the filesystem's listing order.
for filename in sorted(glob.glob("processed/*")):
    with open(filename, encoding='utf-8') as f:
        lines = f.read().splitlines()
    if not lines:
        # An empty file has no first line; skip it instead of raising IndexError.
        continue
    # Only the first line of each file is used.
    data = lines[0]
    data = remove_specialchars(data).strip()
    data = remove_xa0(data).strip()
    # removes multiple whitespaces
    data = " ".join(data.split())
    sentences += speech_to_sentences(data, tokenizer)

# min_word_count was previously defined but never handed to the model, so the
# library default was silently used instead of the configured threshold.
obama_vec = word2vec.Word2Vec(
    sentences, workers=num_workers, size=num_features,
    min_count=min_word_count, window=context, sample=downsampling)
# NOTE(review): `acc` is not used afterwards and the path is machine-specific.
acc = obama_vec.accuracy("/home/syafiq/data/questions-words.txt")
obama_vec.wv.save_word2vec_format("obama_vec.bin", binary=True)