-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathccks_preprocess.py
50 lines (36 loc) · 1.39 KB
/
ccks_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
import codecs
import jieba
from ant_preprocess import stopwordslist
def _char_process(lines, stopwords):
    """Split each sentence pair into space-separated characters.

    Args:
        lines: Iterable of text records, each holding three tab-separated
            fields: first sentence, second sentence, integer label.
        stopwords: Collection of strings to drop from both sentences.

    Returns:
        A list with one ``[sent1_chars, sent2_chars, label]`` entry per
        non-blank input line. The sentences are the kept characters joined
        by single spaces; the label is the integer re-rendered as a string.

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            tab-separated fields, or its label is not an integer.
    """
    # Build the lookup once so each membership test is O(1) regardless of
    # the container type the caller passes in.
    stopset = frozenset(stopwords)

    def _keep_chars(sentence):
        # Literal spaces are dropped so the join below is the only source
        # of separators in the output.
        return ' '.join(c for c in sentence if c != ' ' and c not in stopset)

    sentences = []
    for line in lines:
        # A blank line (typically a trailing one at the end of the file)
        # carries no record; skip it instead of failing the unpack below.
        if not line.strip():
            continue
        sent1, sent2, label = line.split('\t')
        # int() tolerates the trailing newline still attached to the label
        # and normalizes forms such as "01".
        sentences.append(
            [_keep_chars(sent1), _keep_chars(sent2), str(int(label))]
        )
    return sentences
def _word_process(lines, stopwords):
    """Split each sentence pair into space-separated jieba tokens.

    Args:
        lines: Iterable of text records, each holding three tab-separated
            fields: first sentence, second sentence, integer label.
        stopwords: Collection of strings to drop from both sentences.

    Returns:
        A list with one ``[sent1_tokens, sent2_tokens, label]`` entry per
        non-blank input line. The sentences are the kept ``jieba.cut``
        tokens joined by single spaces; the label is the integer
        re-rendered as a string.

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            tab-separated fields, or its label is not an integer.
    """
    # Build the lookup once so each membership test is O(1) regardless of
    # the container type the caller passes in.
    stopset = frozenset(stopwords)

    def _keep_tokens(sentence):
        # Tokens that are a single literal space are dropped so the join
        # below is the only source of separators in the output.
        return ' '.join(
            tok for tok in jieba.cut(sentence)
            if tok != ' ' and tok not in stopset
        )

    sentences = []
    for line in lines:
        # A blank line (typically a trailing one at the end of the file)
        # carries no record; skip it instead of failing the unpack below.
        if not line.strip():
            continue
        sent1, sent2, label = line.split('\t')
        # int() tolerates the trailing newline still attached to the label
        # and normalizes forms such as "01".
        sentences.append(
            [_keep_tokens(sent1), _keep_tokens(sent2), str(int(label))]
        )
    return sentences
def _save_csv(lines, filename):
    """Write rows to ``filename`` as comma-joined UTF-8 text.

    Each row is written on its own line with its fields joined by ``,``.
    Fields are written verbatim: no quoting or escaping is applied, so a
    field that itself contains a comma is not protected.

    Args:
        lines: Iterable of rows, each an iterable of strings.
        filename: Destination path. Any existing file is overwritten and a
            missing parent directory is created.
    """
    parent = os.path.dirname(filename)
    if parent:
        # Avoid FileNotFoundError on a fresh checkout without the output
        # directory.
        os.makedirs(parent, exist_ok=True)
    # Encode explicitly: the input is read as UTF-8, and the platform
    # default encoding may be unable to represent the text.
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(','.join(line) + '\n' for line in lines)
def process(datapath="raw_data/task3_train.txt",
            char_out='data/ccks_char.csv',
            word_out='data/ccks_word.csv'):
    """Preprocess a sentence-pair file into char- and word-level CSVs.

    Reads ``datapath`` as UTF-8, then writes two files: one where each
    sentence is split into characters (``_char_process``) and one where it
    is split with ``jieba.cut`` (``_word_process``). Both passes filter
    with the collection returned by ``stopwordslist()``.

    Args:
        datapath: Path of the input file; each record is expected to hold
            two sentences and an integer label separated by tabs.
        char_out: Destination of the character-level output.
        word_out: Destination of the word-level output.
    """
    # NOTE(review): stopwordslist comes from ant_preprocess; presumably it
    # returns the stopword collection -- confirm its return type there.
    stopwords = stopwordslist()
    with codecs.open(datapath, 'r', 'utf-8') as f:
        lines = f.readlines()

    # Character-level segmentation.
    _save_csv(_char_process(lines, stopwords), char_out)

    # Word-level segmentation.
    _save_csv(_word_process(lines, stopwords), word_out)


if __name__ == "__main__":
    # Run with the default input and output paths when executed as a script.
    process()