# -*- coding: utf-8 -*-
"""createFakeTrainingSamples_FodorsZagats.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1W7SBuSpqDG6QwQoXw0gasa4YraYyhNhd
"""
# Mount Google Drive so the GloVe embedding file stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np

GLOVE_DIR = 'drive/My Drive/deeper/glove'
GLOVE_FILENAME = 'glove.840B.300d.txt'

# Load the pre-trained GloVe embeddings into a dict mapping each token to its
# 300-dimensional weight vector.
wordToEmbeddingMap = {}
with open(GLOVE_DIR + '/' + GLOVE_FILENAME, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        # Each line of glove.840B.300d.txt should hold 301 items: the token
        # followed by its 300 weights. A few lines contain more than one
        # string token, so np.fromstring returns an empty array for them;
        # we skip those malformed lines.
        if len(coefs) != 0:
            wordToEmbeddingMap[word] = coefs
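# A stricter sanity check than the len != 0 test above (not in the original
# notebook): every vector kept in the map should be exactly 300-dimensional.
assert all(vec.shape == (300,) for vec in wordToEmbeddingMap.values())
print(len(wordToEmbeddingMap), 'GloVe tokens loaded')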
import pandas as pd

TABLEA_FILEPATH = 'tableA.csv'
TABLEB_FILEPATH = 'tableB.csv'
tableADf = pd.read_csv(TABLEA_FILEPATH)
tableBDf = pd.read_csv(TABLEB_FILEPATH)
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Tokenize on word characters only, and filter out English stopwords plus
# common punctuation tokens.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
len(stop_words)
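# Illustrative check (not in the original notebook, with a made-up sample
# string): the \w+ tokenizer keeps word characters only, so the punctuation
# entries added to stop_words are a harmless redundancy; only real stopwords
# get filtered here.
_sample = tokenizer.tokenize("Cafe of the Ritz, 160 E. Pearson St.")
print([w for w in _sample if w not in stop_words])
# -> ['Cafe', 'Ritz', '160', 'E', 'Pearson', 'St']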
tableADf.head(5)
tableBDf.head(5)

# Empty frame previewing the attribute schema; the actual cleaning is done by
# createCleanMatrix below, so this frame is never filled in.
cleanTableADf = pd.DataFrame(columns=('name','addr','city','phone','type','class'))
cleanTableADf
def createCleanMatrix(df):
    """Tokenize and stopword-filter each attribute of every record.

    The 'class' attribute (the record id used for matching) is kept as-is,
    wrapped in a single-element list.
    """
    cleanMatrix = []
    for _, row in df.iterrows():
        cleanRecord = []
        for column in ('name', 'addr', 'city', 'phone', 'type'):
            # str() guards against non-string cells (e.g. numbers or NaN).
            tokens = tokenizer.tokenize(str(row[column]))
            cleanRecord.append([word for word in tokens if word not in stop_words])
        cleanRecord.append([row['class']])
        cleanMatrix.append(cleanRecord)
    # dtype=object is required because the token lists have varying lengths.
    return np.array(cleanMatrix, dtype=object)
cleanMatrixA = createCleanMatrix(tableADf)
cleanMatrixB = createCleanMatrix(tableBDf)
cleanMatrixB.shape
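# Quick structural peek (not in the original notebook): each cleaned record is
# six token lists, one per attribute, with 'class' kept as a single item.
print(cleanMatrixA[0])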
"""Creazione del vettore degli embedding"""
def createDistributedRepresentation(cleanMatrix):
    """Map each record to the concatenation of its attribute embeddings.

    Each attribute is embedded as the mean of the GloVe vectors of its
    tokens, so a record becomes a 6 * 300 = 1800-dimensional vector.
    """
    embeddingMap = {}
    for i, record in enumerate(cleanMatrix):
        tupleEmbedding = np.array([])
        for attribute in record:
            ntokens = 0
            numeratoreVec = np.zeros(300)  # running sum of token embeddings
            for token in attribute:
                ntokens += 1
                embeddingVector = wordToEmbeddingMap.get(token)
                if embeddingVector is not None:
                    numeratoreVec += embeddingVector
            # Guard against empty attributes to avoid dividing by zero. Note
            # that the mean is taken over all tokens, including those without
            # a GloVe entry (which contribute a zero vector).
            attributeEmbedding = numeratoreVec / ntokens if ntokens else numeratoreVec
            tupleEmbedding = np.append(tupleEmbedding, attributeEmbedding)
        embeddingMap[i] = tupleEmbedding
    return embeddingMap
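# A minimal worked example of the averaging scheme above (not in the original
# notebook), using a toy two-dimensional embedding map: the attribute
# ['good', 'cafe'] is embedded as the mean of its token vectors.
_toyMap = {'good': np.array([1.0, 0.0]), 'cafe': np.array([0.0, 1.0])}
_toyAttr = ['good', 'cafe']
_toySum = sum(_toyMap[t] for t in _toyAttr)
print(_toySum / len(_toyAttr))  # -> [0.5 0.5]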
embeddingMapTableA = createDistributedRepresentation(cleanMatrixA)
embeddingMapTableB = createDistributedRepresentation(cleanMatrixB)
from sklearn.metrics.pairwise import cosine_similarity

# Example: cosine similarity between record 159 of table A and record 56 of
# table B (cosine_similarity expects 2-D inputs, hence the reshape).
embeddingA159 = embeddingMapTableA[159].reshape(1, -1)
embeddingB56 = embeddingMapTableB[56].reshape(1, -1)
cosineSim = cosine_similarity(embeddingA159, embeddingB56)
cosineSim
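# Sketch of the next step this script is named for (not in the original
# notebook; the threshold value and the output format are assumptions): score
# every A-B record pair by cosine similarity and keep the high-similarity
# pairs as candidate fake positive samples.
SIM_THRESHOLD = 0.9  # hypothetical cutoff
allA = np.vstack([embeddingMapTableA[k] for k in sorted(embeddingMapTableA)])
allB = np.vstack([embeddingMapTableB[k] for k in sorted(embeddingMapTableB)])
simMatrix = cosine_similarity(allA, allB)  # shape (len(A), len(B))
candidatePairs = np.argwhere(simMatrix > SIM_THRESHOLD)
print(len(candidatePairs), 'candidate matching pairs')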