-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrie_tree.py
133 lines (122 loc) · 4.17 KB
/
trie_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding: utf-8 -*-
import time
import os
class TrieTree(object):
"""
Trie Tree by Python
well, you may use Python lib Pickle to write the tree to bin file
and load the file instead of create the tree every time
import pickle
"""
__author__ = 'Luyue'
__authorMail__ = '544625106@qq.com'
def __init__(self):
self.trie = {}
def add(self, word):
#Add a sensetive word
p = self.trie
word = word.strip()
for c in word:
if not c in p:
p[c] = {}
p = p[c]
if word != '':
p[''] = ''
return True
def remove(self, word):
#Remove a sensetive word ,return bool
if self.search(word):
slen = len(word)
tempTrie = self.trie
index = 0
mark = [0]
while index < slen:
if '' in tempTrie[word[index]]:
mark.append(index)
tempTrie = tempTrie[word[index]]
index = index + 1
if tempTrie == {'':''}:
if mark[-2] == 0:
del(self.trie[word[0]])
else:
eval('self.trie'+''.join(["['%s']" % x for x in word[:mark[-2]+1]])).pop(word[mark[-2]+1])
else:
eval('self.trie'+''.join(["['%s']" % x for x in word[:mark[-1]+1]])).pop('')
return True
def search(self, word):
#Check the Trie Tree Whecher has the sensetive word, return bool
p = self.trie
word = word.lstrip()
for c in word:
if not c in p:
return False
p = p[c]
if '' in p:
return True
return False
def check_string(self, string):
#Check the content whether has any sensetive word , return bool
if self.trie == {}:
return False
iLen = len(string)
i = 0
while i < iLen:
trie = self.trie
j = i
while (j < iLen and string[j] in trie):
if '' in trie[string[j]]:
return True
trie = trie[string[j]]
j = j + 1
i = i + 1
return False
def filter_string(self, string):
#Filter the content, return the string and the list of sensetive word
retstring = ''
filterWord = dict()
if self.trie == {}:
return string
iLen = len(string.lstrip())
i = 0
while i < iLen:
trie = self.trie
MaxMatchNum = 0
j = i
while (j< iLen and string[j] in trie):
if '' in trie[string[j]]:
MaxMatchNum = j
trie = trie[string[j]]
j = j + 1
if MaxMatchNum == 0:
retstring += string[i]
i = i + 1
else:
retstring += u'*' * (MaxMatchNum-i+1)
sensetiveWord = string[i:MaxMatchNum+1]
if sensetiveWord in filterWord:
filterWord[sensetiveWord] = filterWord[sensetiveWord] + 1
else:
filterWord[sensetiveWord] = 1
i = MaxMatchNum + 1
return retstring, filterWord
if __name__ == '__main__':
trie_obj = TrieTree()
filtermsg = 'here is the content that you want to filter the sensetive word like fuck fucked fucker';
a = time.time()
#Change the code to your local sensetive word source and Add word
fp = open(os.path.dirname(__file__) + '/word.txt' ,'r', encoding='UTF-8')
words = []
wordnum = 0;
for line in fp:
wordnum = wordnum + 1
trie_obj.add(line.strip("\n"))
fp.close()
b = time.time()
msg, filterWord = trie_obj.filter_string(filtermsg)
c = time.time()
print('敏感词数量: '+ str(wordnum))
print('文章总长度: '+ str(len(filtermsg)))
print('字典构建耗时:'+ str(b-a))
print('检索耗时: '+ str(c-b))
print(msg)
print(filterWord)