-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data_Doc2Vec.py
86 lines (81 loc) · 2.54 KB
/
clean_data_Doc2Vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import numpy as np
# Load the script lines. Missing cells (NaN) are replaced by '' so that the
# code further down can detect them with a plain `== ''` comparison.
database = pd.read_csv('simpsons_script_lines.csv').replace(np.nan, '', regex=True)

# Remove every column the rest of the script does not read, in one call.
# The axis must not be passed positionally (`drop('id', 1)`): that form was
# deprecated in pandas 1.3 and is rejected by pandas >= 2.0. `columns=` is
# accepted by both old and new releases.
database = database.drop(columns=[
    'id',
    'number',
    'raw_text',
    'timestamp_in_ms',
    'raw_character_text',
    'raw_location_text',
    'normalized_text',
    'word_count',
])

# From here on the data is a plain list of rows, each row a list of the
# remaining cell values addressed by position.
database = database.values.tolist()
# Build (question, answer) pairs from consecutive rows of `database`.
#
# Row layout as used below (by position):
#   row[0] episode, row[1] flag that must equal True for the row to count,
#   row[2] character, row[3] location, row[4] text that gets concatenated.
#
# For every row i whose character differs from the character of row i-1:
#   * the question is the text of the run of rows ending at i-1 that share
#     the episode/location of row i and the character of row i-1;
#   * the answer is the text of the run of rows starting at i that share
#     the episode/location/character of row i.


def _collectRun(rows, start, step, episode, character, location):
    """Concatenate row[4] over the run of matching rows beginning at `start`.

    Walks `rows` from index `start` in direction `step` (+1 forward,
    -1 backward) while each row has the given episode, character and
    location and its flag equals True. Pieces are joined with single
    spaces in reading order (earlier rows first) regardless of the walk
    direction.

    The walk is clamped to valid indices: a negative index would silently
    wrap around to the end of the list, and index len(rows) would raise
    IndexError.

    Returns '' when the row at `start` does not match.
    """
    text = ''
    j = start
    while 0 <= j < len(rows):
        row = rows[j]
        # `!= True` is a deliberate equality test: a truthiness check would
        # also accept non-empty strings in this column.
        if (row[0] != episode
                or row[1] != True  # noqa: E712
                or row[2] != character
                or row[3] != location):
            break
        if text == '':
            text = row[4]
        elif step < 0:
            # Walking backwards: older text goes in front.
            text = row[4] + ' ' + text
        else:
            text = text + ' ' + row[4]
        j += step
    return text


# Character value whose answers are counted and placed first in the output.
# NOTE(review): presumably Homer's character id in the CSV - confirm.
homer = 2
numberHomerAnswers = 0

# Pairs are gathered in two append-only lists and merged once at the end;
# rebuilding the full result list for every pair is quadratic.
homerQuestions = []
homerAnswers = []
otherQuestions = []
otherAnswers = []

# Start at 1: row 0 has no predecessor, and database[-1] would wrap around
# to the last row instead of failing.
for i in range(1, len(database)):
    episode = database[i][0]
    location = database[i][3]
    characterQuestion = database[i - 1][2]
    characterAnswer = database[i][2]

    # Both sides need a known character, and the speaker must change
    # between row i-1 and row i.
    if characterQuestion == '' or characterAnswer == '':
        continue
    if characterQuestion == characterAnswer:
        continue

    question = _collectRun(database, i - 1, -1, episode, characterQuestion, location)
    answer = _collectRun(database, i, 1, episode, characterAnswer, location)

    # Skip pairs where either side is empty or whitespace only.
    if len(question.strip()) == 0 or len(answer.strip()) == 0:
        continue

    if characterAnswer == homer:
        numberHomerAnswers += 1
        homerQuestions.append(question)
        homerAnswers.append(answer)
    else:
        otherQuestions.append(question)
        otherAnswers.append(answer)

# Homer pairs come first, most recently found first (the order produced by
# prepending each one), followed by all other pairs in encounter order.
listQuestions = homerQuestions[::-1] + otherQuestions
listAnswers = homerAnswers[::-1] + otherAnswers
# Write the pairs to two parallel text files: line k of questions.txt
# belongs to line k of answers.txt. Text is lower-cased on output.
# `with` guarantees each file is flushed and closed even if a write raises.
with open('questions.txt', 'w', encoding='utf-8') as fileQuestions:
    for question in listQuestions:
        fileQuestions.write(question.lower() + '\n')

with open('answers.txt', 'w', encoding='utf-8') as fileAnswers:
    for answer in listAnswers:
        fileAnswers.write(answer.lower() + '\n')

# Report how many pairs had the `homer` character as the answering side.
print(numberHomerAnswers)