# -*- coding: utf-8 -*-
"""createFakeTrainingSamples_FodorsZagats.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1W7SBuSpqDG6QwQoXw0gasa4YraYyhNhd
"""

from google.colab import drive
drive.mount('/content/drive')

import numpy as np

GLOVE_DIR = 'drive/My Drive/deeper/glove'
GLOVE_FILENAME = 'glove.840B.300d.txt'
wordToEmbeddingMap = {}
with open(GLOVE_DIR + '/' + GLOVE_FILENAME, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        # Each line of glove.840B.300d.txt should hold 301 fields (the token
        # followed by its 300 weights), but a few lines contain multi-word
        # tokens. For those, np.fromstring cannot parse the remainder as
        # floats and yields an empty array, so we skip them.
        if len(coefs) != 0:
            wordToEmbeddingMap[word] = coefs
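
# Quick sanity check (an illustrative addition, not part of the original
# notebook): the vocabulary should be large and every stored vector should
# have 300 dimensions. 'restaurant' is assumed to be in the GloVe vocabulary.
print(len(wordToEmbeddingMap))
print(wordToEmbeddingMap['restaurant'].shape)  # expected: (300,)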

import pandas as pd

TABLEA_FILEPATH = 'tableA.csv'
TABLEB_FILEPATH = 'tableB.csv'
tableADf = pd.read_csv(TABLEA_FILEPATH)
tableBDf = pd.read_csv(TABLEB_FILEPATH)

import nltk

# download only; assigning the result would shadow the stopwords import below
nltk.download("stopwords")

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
# adding punctuation is defensive: the \w+ tokenizer already drops it
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

len(stop_words)

tableADf.head(5)

tableBDf.head(5)

cleanTableADf = pd.DataFrame(columns=('name', 'addr', 'city', 'phone', 'type', 'class'))
cleanTableADf

def createCleanMatrix(df):
    """Tokenize each attribute of every record and drop stopwords."""
    cleanMatrix = []
    for i, row in df.iterrows():
        name = row['name']
        addr = row['addr']
        city = row['city']
        phone = row['phone']
        types = row['type']
        classes = row['class']

        tokens_name = tokenizer.tokenize(name)
        tokens_addr = tokenizer.tokenize(addr)
        tokens_city = tokenizer.tokenize(city)
        tokens_phone = tokenizer.tokenize(phone)
        tokens_type = tokenizer.tokenize(types)
        #tokens_class = tokenizer.tokenize(classes)

        filtered_name = [word for word in tokens_name if word not in stop_words]
        filtered_addr = [word for word in tokens_addr if word not in stop_words]
        filtered_city = [word for word in tokens_city if word not in stop_words]
        filtered_phone = [word for word in tokens_phone if word not in stop_words]
        filtered_type = [word for word in tokens_type if word not in stop_words]
        #filtered_class = [word for word in tokens_class if word not in stop_words]
        cleanMatrix.append([filtered_name, filtered_addr, filtered_city, filtered_phone, filtered_type, [classes]])
    # rows hold variable-length token lists, so the array must be dtype=object
    cleanMatrix = np.array(cleanMatrix, dtype=object)
    return cleanMatrix

cleanMatrixA = createCleanMatrix(tableADf)
cleanMatrixB = createCleanMatrix(tableBDf)

cleanMatrixB.shape

"""Building the embedding vectors"""

def createDistributedRapresentation(cleanMatrix):
    """Embed each record as the concatenation of its per-attribute embeddings.

    Each attribute is the average of the GloVe vectors of its tokens;
    out-of-vocabulary tokens contribute a zero vector but still count in
    the denominator.
    """
    embeddingMap = {}
    i = 0
    for record in cleanMatrix:
        tupleEmbedding = np.array([])
        for attribute in record:
            ntokens = 0
            numeratoreVec = np.zeros(300)
            for token in attribute:
                ntokens += 1
                embeddingVector = wordToEmbeddingMap.get(token)
                if embeddingVector is not None:
                    numeratoreVec += embeddingVector
            # guard against attributes whose tokens were all filtered out
            attributeEmbedding = numeratoreVec / ntokens if ntokens > 0 else numeratoreVec
            tupleEmbedding = np.append(tupleEmbedding, attributeEmbedding)
        embeddingMap[i] = tupleEmbedding
        i += 1
    return embeddingMap

embeddingMapTableA = createDistributedRapresentation(cleanMatrixA)
embeddingMapTableB = createDistributedRapresentation(cleanMatrixB)
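
# Each record embedding concatenates six 300-dimensional attribute vectors,
# so every entry should have shape (1800,). A quick check (illustrative
# addition, assuming tableA is non-empty):
assert embeddingMapTableA[0].shape == (1800,)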

from sklearn.metrics.pairwise import cosine_similarity

# compare one record from each table (indices chosen by hand)
embeddingA159 = embeddingMapTableA[159].reshape(1, -1)
embeddingB56 = embeddingMapTableB[56].reshape(1, -1)

cosineSim = cosine_similarity(embeddingA159, embeddingB56)
cosineSim
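
# A minimal sketch of how these similarities could be used: for one tableA
# record, rank all tableB records by cosine similarity and inspect the top
# candidates. The stacking into a matrix and the choice of record 159 are
# illustrative assumptions, not part of the original notebook.
embeddingMatrixB = np.vstack([embeddingMapTableB[j] for j in sorted(embeddingMapTableB)])
similarities = cosine_similarity(embeddingA159, embeddingMatrixB)[0]
topCandidates = np.argsort(similarities)[::-1][:5]  # indices of the 5 most similar tableB records
topCandidates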