Skip to content

Commit ec3ebce

Browse files
Add files via upload
Added method to compute distributed representation by average
1 parent b6e8caa commit ec3ebce

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed
+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# -*- coding: utf-8 -*-
2+
"""createFakeTrainingSamples_FodorsZagats.ipynb
3+
4+
Automatically generated by Colaboratory.
5+
6+
Original file is located at
7+
https://colab.research.google.com/drive/1W7SBuSpqDG6QwQoXw0gasa4YraYyhNhd
8+
"""
9+
10+
# Mount Google Drive so the GloVe file below is reachable (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

import numpy as np

# Location of the pre-trained 300-dimensional GloVe embeddings.
GLOVE_DIR = 'drive/My Drive/deeper/glove'
GLOVE_FILENAME = 'glove.840B.300d.txt'

# Map from vocabulary token -> 300-d float32 embedding vector.
wordToEmbeddingMap = {}
# GloVe files are UTF-8; pass the encoding explicitly so loading does not
# depend on the platform's default locale encoding.
with open(GLOVE_DIR + '/' + GLOVE_FILENAME, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        # glove.840B.300d.txt is supposed to contain 301-item lines (the token
        # plus its 300 weights), but a few lines hold multi-token "words".
        # np.array raises ValueError on those non-numeric remainders, so we
        # skip them.  (np.fromstring text parsing is deprecated in NumPy.)
        try:
            coefs = np.array(coefs.split(), dtype=np.float32)
        except ValueError:
            continue
        if len(coefs) != 0:
            wordToEmbeddingMap[word] = coefs
25+
26+
import pandas as pd

# Input tables of the Fodors-Zagats entity-matching dataset.
TABLEA_FILEPATH = 'tableA.csv'
TABLEB_FILEPATH = 'tableB.csv'
# Read via the declared constants instead of repeating the literal paths.
tableADf = pd.read_csv(TABLEA_FILEPATH)
tableBDf = pd.read_csv(TABLEB_FILEPATH)

import nltk
# nltk.download returns a success flag; do not bind it to the name
# `stopwords`, which would be shadowed by the corpus import below anyway.
nltk.download("stopwords")

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Tokenizer that keeps only \w+ runs, i.e. drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
# Also treat bare punctuation marks as stop words.
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

# Notebook inspection cells (expression statements; shown, not stored).
len(stop_words)

tableADf.head(5)

tableBDf.head(5)

# Placeholder frame showing the cleaned schema (inspection only; the actual
# cleaning happens in createCleanMatrix below).
cleanTableADf = pd.DataFrame(columns=('name', 'addr', 'city', 'phone', 'type', 'class'))
cleanTableADf
49+
50+
def createCleanMatrix(df, textColumns=('name', 'addr', 'city', 'phone', 'type'),
                      labelColumn='class', tokenize=None, stopWords=None):
    """Tokenize and stop-word-filter each record of *df* into a token matrix.

    Each text column is tokenized and filtered against the stop-word set; the
    label column is kept verbatim (wrapped in a one-element list so every cell
    has a uniform list shape).

    Parameters
    ----------
    df : pandas.DataFrame with the columns named in *textColumns*/*labelColumn*.
    textColumns : columns to tokenize and filter (defaults match the
        Fodors-Zagats schema used in this notebook).
    labelColumn : column copied through untokenized.
    tokenize : callable str -> list[str]; defaults to the module-level
        ``tokenizer.tokenize`` (RegexpTokenizer).
    stopWords : collection of tokens to drop; defaults to the module-level
        ``stop_words`` set.

    Returns
    -------
    numpy.ndarray of dtype=object, shape (len(df), len(textColumns) + 1),
    where each cell is a list of tokens.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize
    if stopWords is None:
        stopWords = stop_words

    cleanMatrix = []
    for _, row in df.iterrows():
        cleanRow = [
            [word for word in tokenize(row[col]) if word not in stopWords]
            for col in textColumns
        ]
        # Label kept as-is, wrapped for a uniform per-cell list shape.
        cleanRow.append([row[labelColumn]])
        cleanMatrix.append(cleanRow)
    # dtype=object is required: the token lists are ragged, and modern NumPy
    # refuses to coerce ragged nested lists into a regular array implicitly.
    return np.array(cleanMatrix, dtype=object)
76+
77+
cleanMatrixA = createCleanMatrix(tableADf)
78+
cleanMatrixB = createCleanMatrix(tableBDf)
79+
80+
cleanMatrixB.shape
81+
82+
"""Creazione del vettore degli embedding"""
83+
84+
def createDistributedRapresentation(cleanMatrix, embeddingLookup=None, dim=300):
    """Build one fixed-length vector per record by averaging token embeddings.

    For every record, each attribute's tokens are looked up in the embedding
    map and their vectors averaged; the per-attribute averages are then
    concatenated into a single "tuple embedding".

    Parameters
    ----------
    cleanMatrix : iterable of records, each record an iterable of attributes,
        each attribute a list of string tokens (as built by createCleanMatrix).
    embeddingLookup : dict token -> np.ndarray of length *dim*; defaults to
        the module-level ``wordToEmbeddingMap`` (GloVe).
    dim : embedding dimensionality (default 300, matching glove.840B.300d).

    Returns
    -------
    dict mapping record index -> 1-D np.ndarray of length dim * n_attributes.
    """
    if embeddingLookup is None:
        embeddingLookup = wordToEmbeddingMap
    embeddingMap = {}
    for i, record in enumerate(cleanMatrix):
        tupleEmbedding = np.array([])
        for attribute in record:
            ntokens = 0
            sumVec = np.zeros(dim)
            for token in attribute:
                # NOTE(review): the divisor counts every token, including
                # out-of-vocabulary ones, so the average is diluted by OOV
                # tokens — kept as in the original; confirm it is intended.
                ntokens += 1
                embeddingVector = embeddingLookup.get(token)
                if embeddingVector is not None:
                    sumVec += embeddingVector
            # Guard against empty attributes: the original divided by zero
            # here, yielding a NaN vector; use an all-zero vector instead.
            attributeEmbedding = sumVec / ntokens if ntokens else sumVec
            tupleEmbedding = np.append(tupleEmbedding, attributeEmbedding)
        embeddingMap[i] = tupleEmbedding
    return embeddingMap
102+
103+
# Averaged tuple embeddings for every record of each table.
embeddingMapTableA = createDistributedRapresentation(cleanMatrixA)
embeddingMapTableB = createDistributedRapresentation(cleanMatrixB)

from sklearn.metrics.pairwise import cosine_similarity

# Sanity check: similarity between one record of table A and one of table B,
# each promoted to a (1, n) row vector as cosine_similarity expects 2-D input.
# NOTE(review): the variable names say 55/273 but the indices used are 159/56
# — presumably leftovers from an earlier experiment; verify the intended pair.
embedding55 = embeddingMapTableA[159][np.newaxis, :]
embedding273 = embeddingMapTableB[56][np.newaxis, :]

cosineSim = cosine_similarity(embedding55, embedding273)
cosineSim

0 commit comments

Comments
 (0)