-
Notifications
You must be signed in to change notification settings - Fork 45
/
Copy pathextractor.py
76 lines (64 loc) · 2.31 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Tool to extract sentences & words from a file.
from sys import argv
import parser
def get_sentences(file_name):
    """Extract sentences from a text file.

    The whole file is read, newlines are removed, and the text is split on
    sentence terminators ('?' and '!' are first rewritten to '.', so all
    three end a sentence). The resulting pieces are then run through a fixed
    sequence of `parser` helpers.

    Args:
        file_name: Path of the text file to read.

    Returns:
        Whatever the final `parser.comma_handler` step returns -- presumably
        a list of sentence strings; confirm against the `parser` module.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open(file_name) as reader:
        text = reader.read()

    # NOTE(review): newlines are dropped without inserting a space, so the
    # last word of one line is fused with the first word of the next.
    # get_words() substitutes a space instead -- confirm this is intended.
    text = text.replace("\n", "")

    # Runs before the split on "."; presumably it keeps periods inside
    # abbreviations from being treated as sentence ends -- verify in parser.
    text = parser.convert_abbreviations(text)

    # Normalise every sentence terminator to "." so a single split suffices.
    text = text.replace("?", ".")
    text = text.replace("!", ".")
    sentences = text.split(".")

    # Post-processing order is kept exactly as before; whether the helpers
    # depend on that order cannot be determined from this file.
    sentences = parser.fix_broken_sentences(sentences)
    sentences = parser.remove_whitespace_list(sentences)
    sentences = parser.remove_blanks(sentences)
    sentences = parser.add_periods(sentences)
    sentences = parser.clean_up_quotes(sentences)
    sentences = parser.group_quotes(sentences)
    sentences = parser.comma_handler(sentences)
    return sentences
def get_words(file_name):
    """Extract cleaned words from a text file.

    The whole file is read, newlines are replaced by spaces, abbreviations
    are converted, and the text is split on single spaces. Blank entries are
    removed and every remaining word is passed through `parser.clean` (per
    the original author's note, that strips surrounding punctuation and
    whitespace and converts the word to singular).

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one `parser.clean` result per non-blank word.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open(file_name) as reader:
        text = reader.read()

    # Treat line breaks as word separators before splitting.
    text = text.replace("\n", " ")
    text = parser.convert_abbreviations(text)

    # Splitting on a literal " " yields empty strings for repeated spaces;
    # remove_blanks is relied upon to discard them.
    words = parser.remove_blanks(text.split(" "))
    return [parser.clean(word) for word in words]
def print_usage():
    """Print how to run the tool and which parameters it accepts."""
    # The empty first and last entries reproduce the blank line that
    # precedes and follows the usage text.
    usage_lines = (
        "",
        "Usage:",
        "extractor.py <article.txt> [parameter]",
        "Parameters:",
        "-i --info display basic info about <article.txt>",
        "-s --sentences extract sentences from <article.txt>",
        "-w --words extract words from <article.txt>",
        "",
    )
    print("\n".join(usage_lines))
def handle_arguments():
    """Run the action selected by the command-line flag.

    Reads the flag from argv[2] and the input path from argv[1]:
    -i/--info prints the sentence and word counts, -s/--sentences prints
    each sentence on its own line, -w/--words prints each word on its own
    line. Any other flag prints the usage text.
    """
    flag = argv[2]
    path = argv[1]

    if flag in ("-i", "--info"):
        sentence_total = len(get_sentences(path))
        print("Sentence count: %6d" % sentence_total)
        word_total = len(get_words(path))
        print("Word count: %6d" % word_total)
        return

    if flag in ("-s", "--sentences"):
        for sentence in get_sentences(path):
            print(sentence)
        return

    if flag in ("-w", "--words"):
        for word in get_words(path):
            print(word)
        return

    # Unrecognised flag: fall back to the help text.
    print_usage()
if __name__ == "__main__":
    # Exactly two user arguments are expected (<article.txt> and one flag);
    # any other argument count just shows the usage text.
    entry_point = handle_arguments if len(argv) == 3 else print_usage
    entry_point()