-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathAR_reviewInstance.py
executable file
·134 lines (123 loc) · 3.68 KB
/
AR_reviewInstance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/python
#
# Define the review instances
#
import json
import numpy as np
# Class that contains the information for each review:
class Review:
    """Hold one review together with its metadata and derived vectors.

    A freshly constructed instance carries placeholder values; populate it
    with ``fromText`` or ``fromJson``.  The vector attributes (``vnp``,
    ``vdict``, ``tf_idf``) stay ``None`` until something computes them.
    """

    # Constructor:
    def __init__(self):
        # the review id
        self.id = -1
        # the review content, as a list of terms
        self.content = []
        # the review raw text, as a string
        self.text = None
        # rating of this review (1-5):
        self.rating = -1
        # number of tokens in the review:
        self.ntokens = -1
        # the time stamp:
        self.ts = ""
        # the group this review belongs to
        self.group = ""
        # the probability
        self.prob = 0
        # the label for this review:
        # 1 -> informative; 0 -> unlabeled; -1 -> non-informative
        self.label = 0
        # doc vector (numpy vector), set by formNpVector:
        self.vnp = None
        # doc vector (dict vector), set by formDictVector:
        self.vdict = None
        # tf-idf vector (not computed by any method of this class):
        self.tf_idf = None

    # initialize from reading each review instance from the dataset
    def fromText(self, id, content, ntokens, rating, label, raw_text):
        """Fill the basic fields from already-parsed dataset values.

        ``ts``, ``group`` and ``prob`` are not touched and keep whatever
        value they had before the call.
        """
        self.id = id
        self.content = content
        self.text = raw_text
        self.ntokens = ntokens
        self.rating = rating
        self.label = label

    # initialize from the json dict
    def fromJson(self, jsonDict):
        """Fill the fields from a dict shaped like ``toJsonDict`` output.

        Raises:
            KeyError: if any of the keys 'id', 'content', 'text',
                'ntokens', 'rating', 'label', 'ts', 'group' or 'prob'
                is missing from ``jsonDict``.
        """
        self.id = jsonDict['id']
        self.content = jsonDict['content']
        self.text = jsonDict['text']
        self.ntokens = jsonDict['ntokens']
        self.rating = jsonDict['rating']
        self.label = jsonDict['label']
        self.ts = jsonDict['ts']
        self.group = jsonDict['group']
        self.prob = jsonDict['prob']

    # change the review instance to a dictionary instance
    def toDist(self):
        """Return every attribute of the review as a plain dict.

        Despite the method name, the result is a ``dict``.  Unlike
        ``toJsonDict`` it also includes the vector attributes ``vnp``,
        ``vdict`` and ``tf_idf`` (by reference, not copied).
        """
        return {
            'id': self.id,
            'content': self.content,
            'text': self.text,
            'rating': self.rating,
            'ntokens': self.ntokens,
            'ts': self.ts,
            'group': self.group,
            'prob': self.prob,
            'label': self.label,
            'vnp': self.vnp,
            'vdict': self.vdict,
            'tf_idf': self.tf_idf,
        }

    # convert the review to the json text
    def toJsonDict(self):
        """Return the metadata fields as a dict, without the vectors.

        The key set matches exactly what ``fromJson`` reads back.
        """
        return {
            'id': self.id,
            'content': self.content,
            'text': self.text,
            'rating': self.rating,
            'ntokens': self.ntokens,
            'ts': self.ts,
            'group': self.group,
            'prob': self.prob,
            'label': self.label,
        }

    # Remove the terms (rare ones) of the content that are not in the dictionary
    def removeRareTerm(self, vocabulary):
        """Drop every term of ``content`` that is not a key of ``vocabulary``.

        ``content`` is replaced by the filtered list (order and duplicates
        preserved) and ``ntokens`` is updated to its new length.
        """
        # `in` replaces dict.has_key(), which no longer exists in Python 3.
        kept = [term for term in self.content if term in vocabulary]
        self.content = kept
        self.ntokens = len(kept)

    # given an vocabulary, form as a doc vector (numpy vector)
    def formNpVector(self, vocabulary):
        """Build a term-count vector, store it in ``vnp`` and return it.

        ``vocabulary`` maps each term to its index in the vector; the
        vector has ``len(vocabulary)`` float entries.

        Raises:
            KeyError: if ``content`` holds a term missing from
                ``vocabulary`` (no filtering is done here).
        """
        v = np.zeros(len(vocabulary), dtype='float')
        for term in self.content:
            v[vocabulary[term]] += 1
        self.vnp = v
        return v

    # form as a dict vector(more sparse)
    def formDictVector(self):
        """Build a term -> count dict, store it in ``vdict`` and return it."""
        v_dict = {}
        for term in self.content:
            # dict.get replaces the has_key()/initialise-to-zero pair.
            v_dict[term] = v_dict.get(term, 0) + 1
        self.vdict = v_dict
        return v_dict

    # form a full dict vector given the vocabulary, ignore the words not in the vocabulary
    def formFullDictVector(self, vocabulary):
        """Return a term -> count dict restricted to keys of ``vocabulary``.

        The result is only returned; ``vdict`` is left unchanged.
        """
        v_dict = {}
        for term in self.content:
            if term not in vocabulary:
                continue
            v_dict[term] = v_dict.get(term, 0) + 1
        return v_dict

    def printSelf(self):
        """Print a one-line summary of the review, then its raw text."""
        tmp = " ".join(self.content)
        print(
            "Review id: " + str(self.id)
            + " Rating: " + str(self.rating)
            + " Content: " + tmp
            + " Ntokens: " + str(self.ntokens)
            + " TS: " + self.ts
            + " Group: " + self.group
            + " Prob: " + str(self.prob)
            + " label: " + str(self.label)
        )
        print("Raw text: " + str(self.text))