# -*- coding: utf-8 -*-
"""createFakeTrainingSamples_FodorsZagats.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1W7SBuSpqDG6QwQoXw0gasa4YraYyhNhd
"""

from google.colab import drive
drive.mount('/content/drive')

import numpy as np

GLOVE_DIR = 'drive/My Drive/deeper/glove'
GLOVE_FILENAME = 'glove.840B.300d.txt'
wordToEmbeddingMap = {}
with open(GLOVE_DIR + '/' + GLOVE_FILENAME, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        # Each line of glove.840B.300d.txt should hold 301 fields (the token
        # followed by its 300 weights), but a few lines contain multi-word
        # tokens. For those, np.fromstring cannot parse the remainder as
        # floats and yields an empty array, so we skip them.
        if len(coefs) != 0:
            wordToEmbeddingMap[word] = coefs
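
# Quick sanity check (an illustrative addition, not part of the original
# notebook): the vocabulary should be large and every stored vector should
# have 300 dimensions. 'restaurant' is assumed to be in the GloVe vocabulary.
print(len(wordToEmbeddingMap))
print(wordToEmbeddingMap['restaurant'].shape)  # expected: (300,)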

import pandas as pd

TABLEA_FILEPATH = 'tableA.csv'
TABLEB_FILEPATH = 'tableB.csv'
tableADf = pd.read_csv(TABLEA_FILEPATH)
tableBDf = pd.read_csv(TABLEB_FILEPATH)

import nltk

# download only; assigning the result would shadow the stopwords import below
nltk.download("stopwords")

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
# adding punctuation is defensive: the \w+ tokenizer already drops it
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

len(stop_words)

tableADf.head(5)

tableBDf.head(5)

cleanTableADf = pd.DataFrame(columns=('name', 'addr', 'city', 'phone', 'type', 'class'))
cleanTableADf

def createCleanMatrix(df):
    """Tokenize each attribute of every record and drop stopwords."""
    cleanMatrix = []
    for i, row in df.iterrows():
        name = row['name']
        addr = row['addr']
        city = row['city']
        phone = row['phone']
        types = row['type']
        classes = row['class']

        tokens_name = tokenizer.tokenize(name)
        tokens_addr = tokenizer.tokenize(addr)
        tokens_city = tokenizer.tokenize(city)
        tokens_phone = tokenizer.tokenize(phone)
        tokens_type = tokenizer.tokenize(types)
        #tokens_class = tokenizer.tokenize(classes)

        filtered_name = [word for word in tokens_name if word not in stop_words]
        filtered_addr = [word for word in tokens_addr if word not in stop_words]
        filtered_city = [word for word in tokens_city if word not in stop_words]
        filtered_phone = [word for word in tokens_phone if word not in stop_words]
        filtered_type = [word for word in tokens_type if word not in stop_words]
        #filtered_class = [word for word in tokens_class if word not in stop_words]
        cleanMatrix.append([filtered_name, filtered_addr, filtered_city, filtered_phone, filtered_type, [classes]])
    # rows hold variable-length token lists, so the array must be dtype=object
    cleanMatrix = np.array(cleanMatrix, dtype=object)
    return cleanMatrix

cleanMatrixA = createCleanMatrix(tableADf)
cleanMatrixB = createCleanMatrix(tableBDf)

cleanMatrixB.shape

"""Building the embedding vectors"""

def createDistributedRapresentation(cleanMatrix):
    """Embed each record as the concatenation of its per-attribute embeddings.

    Each attribute is the average of the GloVe vectors of its tokens;
    out-of-vocabulary tokens contribute a zero vector but still count in
    the denominator.
    """
    embeddingMap = {}
    i = 0
    for record in cleanMatrix:
        tupleEmbedding = np.array([])
        for attribute in record:
            ntokens = 0
            numeratoreVec = np.zeros(300)
            for token in attribute:
                ntokens += 1
                embeddingVector = wordToEmbeddingMap.get(token)
                if embeddingVector is not None:
                    numeratoreVec += embeddingVector
            # guard against attributes whose tokens were all filtered out
            attributeEmbedding = numeratoreVec / ntokens if ntokens > 0 else numeratoreVec
            tupleEmbedding = np.append(tupleEmbedding, attributeEmbedding)
        embeddingMap[i] = tupleEmbedding
        i += 1
    return embeddingMap

embeddingMapTableA = createDistributedRapresentation(cleanMatrixA)
embeddingMapTableB = createDistributedRapresentation(cleanMatrixB)
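
# Each record embedding concatenates six 300-dimensional attribute vectors,
# so every entry should have shape (1800,). A quick check (illustrative
# addition, assuming tableA is non-empty):
assert embeddingMapTableA[0].shape == (1800,)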

from sklearn.metrics.pairwise import cosine_similarity

# compare one record from each table (indices chosen by hand)
embeddingA159 = embeddingMapTableA[159].reshape(1, -1)
embeddingB56 = embeddingMapTableB[56].reshape(1, -1)

cosineSim = cosine_similarity(embeddingA159, embeddingB56)
cosineSim
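
# A minimal sketch of how these similarities could be used: for one tableA
# record, rank all tableB records by cosine similarity and inspect the top
# candidates. The stacking into a matrix and the choice of record 159 are
# illustrative assumptions, not part of the original notebook.
embeddingMatrixB = np.vstack([embeddingMapTableB[j] for j in sorted(embeddingMapTableB)])
similarities = cosine_similarity(embeddingA159, embeddingMatrixB)[0]
topCandidates = np.argsort(similarities)[::-1][:5]  # indices of the 5 most similar tableB records
topCandidates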