preprocess.py
import os
import json
from utils import CompressedTrie
from utils import fetch_htmls, extract_htmls, WordPageList
from utils import get_html_filename, get_html_tokenname, get_html_contentname
from utils import Configuration
from utils import pagerank
from utils.algorithm.pagerank import preprocess
from utils import extract_word
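
# Preprocessing pipeline:
#   1. Fetch the HTML of every URL listed in the input file.
#   2. Extract the outgoing links and the words of each fetched page.
#   3. Insert every word into a CompressedTrie; each word maps to a
#      WordPageList of (page id, term frequency) entries.
#   4. Optionally weight the term frequencies by PageRank and sort each
#      word's page list by the combined score.
#   5. Persist the per-word page lists and the trie to disk for later lookup.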
class PreprocessConfiguration(Configuration):
    input_path = None
    output_path = None
    use_pagerank = True
    use_tfidf = True

    def __init__(self, config=None):
        super().__init__(config)
def load_content(path, html_id):
    filename = get_html_contentname(html_id)
    path = os.path.join(path, filename)
    with open(path, encoding='utf-8') as f:
        return f.read()
def dump_html_words(path, html_id, word_dict):
    tokenname = get_html_tokenname(html_id)
    path = os.path.join(path, tokenname)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(word_dict, f)
def standardize_bbc_link(urls, links):
    domain = 'https://www.bbc.com'
    alter_domain = 'https://www.bbc.co.uk'
    result = []
    for link in links:
        if link.startswith('/'):
            std_link = domain + link
        elif link.startswith(alter_domain):
            std_link = link.replace(alter_domain, domain)
        elif link.startswith(domain):
            std_link = link
        else:
            # Skip external links; otherwise std_link would carry over a
            # stale value from a previous iteration.
            continue
        if std_link in urls:
            result.append(std_link)
    return list(set(result))
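
# Example (illustrative; assuming the first URL below is in `urls`):
#   standardize_bbc_link(
#       ['https://www.bbc.com/news/world-1234'],
#       ['/news/world-1234', 'https://www.bbc.co.uk/news/world-1234'])
#   -> ['https://www.bbc.com/news/world-1234']
# Relative paths are prefixed with the .com domain, .co.uk links are rewritten
# to .com, and anything not present in the crawled URL list is dropped.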
if __name__ == '__main__':
    preprocess_config = {
        'input_path': 'data/input/urls',
        'output_path': 'data/output/',
        'use_pagerank': True
    }
    config = PreprocessConfiguration(preprocess_config)

    output_path = config.output_path
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    html_path = os.path.join(output_path, 'htmls')
    word_path = os.path.join(output_path, 'words')
    trie_path = os.path.join(output_path, 'trie')
    if not os.path.exists(html_path):
        os.mkdir(html_path)
    if not os.path.exists(word_path):
        os.mkdir(word_path)
    if not os.path.exists(trie_path):
        os.mkdir(trie_path)
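
    # Output layout: <output_path>/htmls holds the fetched pages and their
    # per-page word counts, <output_path>/words holds each word's occurrence
    # list, and <output_path>/trie holds the dumped CompressedTrie.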
    # Build the link relation for every fetched page (the page URL and the
    # links it points to), e.g.
    # [
    #     [url, [link1, link2, ...]],
    #     ...
    # ]
    with open(config.input_path, 'r', encoding='utf-8') as f:
        urls = [url.strip() for url in f.readlines()]
    url_id_map, success_urls, fail_urls = fetch_htmls(urls, html_path)
    links_relation = extract_htmls(success_urls, html_path)
    for i, (url, links) in enumerate(links_relation):
        links_relation[i] = [url, standardize_bbc_link(urls, links)]
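
    # At this point links_relation pairs each successfully fetched URL with
    # the standardized links it points to, restricted to pages that were
    # actually crawled; url_id_map (presumably URL -> page id) is used later
    # to build the PageRank input.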
    trie = CompressedTrie()
    # Insert the words of each page into the trie and record their
    # frequencies together with the page id in the occurrence list.
    print('inserting words to trie...')
    for nid, url in enumerate(success_urls):
        print('processing %d / %d pages...' % (nid + 1, len(success_urls)))
        content = load_content(html_path, nid)
        word_dict, total_count = extract_word(content)
        dump_html_words(html_path, nid, word_dict)
        for word, count in word_dict.items():
            value = trie.search(word)
            if value is None:
                id_ = trie.size()
                value = WordPageList(id_)
                value.new_page(nid, count / total_count, word_path)
                trie.insert(word, value)
            else:
                value.append_page(nid, count / total_count, word_path)
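
    # Each trie value is now a WordPageList whose entries pair a page id with
    # the word's term frequency (count / total_count) on that page; the lists
    # themselves are stored under word_path and reloaded on demand.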
    # Calculate PageRank and combine it with term frequency to rank pages.
    if config.use_pagerank:
        print('calculating page rank...')
        reversed_links_relation = preprocess(links_relation, url_id_map)
        ranks = pagerank(reversed_links_relation)
        # Re-score every occurrence list: score = term frequency * PageRank,
        # then sort each word's pages by descending score.
        for value in trie.value_list():
            page_list = value.load(word_path)
            for word_page in page_list:
                nid, count = word_page.page_id(), word_page.count()
                word_page.set_score(count * ranks[nid])
            page_list.sort(key=lambda t: -t.score())
            value.dump(page_list, word_path)

    CompressedTrie.dump(trie, trie_path)