-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathindexing_app.py
executable file
·120 lines (91 loc) · 4.05 KB
/
indexing_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
This is the main Python script for indexing local HTML documents into Elasticsearch.
This code is written in Python 3 syntax.
See the README, or run the script with -h, for the expected parameters.
"""
import os, re
import globals, common, index_definitions
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup # library for parsing html
from bs4.element import Comment
# Module-level Elasticsearch client used by every function in this script.
# The host and the credentials are read from the project's globals module.
es = Elasticsearch(
    [globals.ES_HOST],
    http_auth=(globals.ES_USER, globals.ES_PASSWORD),
)
def extract_fields_from_html(html_body):
    """Extract the title and the visible text from an html document.

    Style and script elements are removed before the text is collected
    (could be easily modified to remove more if necessary).

    Keyword arguments:
    html_body -- the html that we will be cleaning up

    Returns a dict with two keys:
    "title" -- the page title with runs of whitespace collapsed to a single
               space, or "" when no usable title is present
    "content" -- the remaining text of the page, with runs of spaces
                 collapsed and at most one newline between lines
    """
    soup = BeautifulSoup(html_body, 'html.parser')

    # Get the title from the html. However, some pages just redirect to others,
    # and so might not have a title set. Only the failures that mean "there is
    # no usable title" are caught, so unrelated errors are no longer hidden:
    #   AttributeError -- there is no <title> element (soup.title is None)
    #   IndexError     -- the <title> element has no children
    #   TypeError      -- the first child of <title> is a tag, not text
    try:
        title = re.sub(r'\s+', ' ', soup.title.contents[0])
    except (AttributeError, IndexError, TypeError):
        title = ""

    # Remove styles and scripts from the html for ingestion into the contents
    for element in soup(['style', 'script']):
        element.extract()
    visible_text = soup.get_text()

    # The following lines cleanup the text for better display when "View indexed content"
    # is selected. This leaves in a single newline when multiple \n are encountered.
    #
    # Replace multiple spaces with a single space, but leave newlines in for now
    visible_text = re.sub(r'[^\S\n]+', ' ', visible_text)
    # Replace multiple sequential newlines with a single newline
    visible_text = re.sub(r'\n+', '\n', visible_text)

    return {
        "title": title,
        "content": visible_text
    }
def walk_and_index_all_files(input_files_root, index_name):
    """Walk the directory tree starting at input_files_root, and ingest each
    html document that is encountered into an Elasticsearch index.

    Every file whose name ends in ".html" is read, converted to a document
    by extract_fields_from_html, tagged with its path relative to
    input_files_root, and sent to the index.

    Keyword arguments:
    input_files_root -- the base directory which the html files reside in
    index_name -- name of the index that will be used
    """
    for root, _dirs, files in os.walk(input_files_root):
        # The relative directory is the same for every file in this folder,
        # so compute it once per directory rather than once per file.
        rel_dir = os.path.relpath(root, input_files_root)
        for file_name in files:
            if not file_name.endswith(".html"):
                continue

            relative_path_to_file = os.path.join(rel_dir, file_name)
            print("indexing %s from %s" % (index_name, relative_path_to_file))

            abs_file_path = os.path.join(input_files_root, relative_path_to_file)
            # Read inside a context manager so that the file handle is closed
            # straight away instead of leaking one handle per indexed file.
            with open(abs_file_path) as infile:
                html_from_file = infile.read()

            json_to_index = extract_fields_from_html(html_from_file)
            json_to_index['relative_path_to_file'] = relative_path_to_file

            # id=None leaves the choice of document id to Elasticsearch
            es.index(index=index_name, id=None,
                     body=json_to_index)
def configure_index(index_name):
    """Ensure that settings and mappings are defined on the Elasticsearch
    index that we will write our documents into.

    If the index already exists, the user is asked on the console whether to
    overwrite it (delete it and create it again), append to it (leave it as
    it is) or abort the program. The question is repeated until one of those
    three answers is given, so that a typo is never silently treated as
    'append'.

    Keyword arguments:
    index_name -- name of the index that will be used

    Raises:
    SystemExit -- with exit status 0 when the user answers 'abort'
    """
    index_exists = es.indices.exists(index=index_name)

    if index_exists:
        print("Index: %s already exists. Would you like to overwrite, append, or abort" % index_name)

        answer = None
        while answer not in ("overwrite", "append", "abort"):
            answer = input("Type one of 'overwrite', 'append' or 'abort': ").strip().lower()

        if answer == "overwrite":
            # ignore=[400, 404] asks the client not to raise for these HTTP
            # statuses, e.g. when the index has disappeared in the meantime
            es.indices.delete(index=index_name, ignore=[400, 404])
            index_exists = False
        elif answer == "abort":
            # Same effect as exit(0), without depending on the site module
            raise SystemExit(0)
        # 'append': nothing to do, the existing index is used as it is

    # If the index doesn't exist, then write settings/mappings
    if not index_exists:
        request_body = {
            'settings': index_definitions.INDEX_SETTINGS,
            'mappings': index_definitions.INDEX_MAPPINGS
        }
        es.indices.create(index=index_name, body=request_body)
def main():
    """Entry point: read the command line arguments, make sure the target
    index is configured, and then index the html documents into Elasticsearch.
    """
    cli_args = common.parse_arguments()
    root_dir, target_index = cli_args.path, cli_args.index_name

    # The index must be set up before any document is written into it
    configure_index(target_index)
    walk_and_index_all_files(root_dir, target_index)
# Only start indexing when this file is executed as a script, not on import
if __name__ == "__main__":
    main()