# -*- coding: utf-8 -*-
"""createFakeTrainingSamples_FodorsZagats.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1W7SBuSpqDG6QwQoXw0gasa4YraYyhNhd
"""
# Mount Google Drive so the GloVe embedding file stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np

GLOVE_DIR = 'drive/My Drive/deeper/glove'
GLOVE_FILENAME = 'glove.840B.300d.txt'

# Load the pre-trained GloVe embeddings into a dict mapping each token to its
# 300-dimensional weight vector.
wordToEmbeddingMap = {}
with open(GLOVE_DIR + '/' + GLOVE_FILENAME, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        # Each line of glove.840B.300d.txt should hold 301 items: the token
        # followed by its 300 weights. A few lines contain more than one
        # string token, so np.fromstring returns an empty array for them;
        # we skip those malformed lines.
        if len(coefs) != 0:
            wordToEmbeddingMap[word] = coefs
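# A stricter sanity check than the len != 0 test above (not in the original
# notebook): every vector kept in the map should be exactly 300-dimensional.
assert all(vec.shape == (300,) for vec in wordToEmbeddingMap.values())
print(len(wordToEmbeddingMap), 'GloVe tokens loaded')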
import pandas as pd

TABLEA_FILEPATH = 'tableA.csv'
TABLEB_FILEPATH = 'tableB.csv'
tableADf = pd.read_csv(TABLEA_FILEPATH)
tableBDf = pd.read_csv(TABLEB_FILEPATH)
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Tokenize on word characters only, and filter out English stopwords plus
# common punctuation tokens.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
len(stop_words)
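# Illustrative check (not in the original notebook, with a made-up sample
# string): the \w+ tokenizer keeps word characters only, so the punctuation
# entries added to stop_words are a harmless redundancy; only real stopwords
# get filtered here.
_sample = tokenizer.tokenize("Cafe of the Ritz, 160 E. Pearson St.")
print([w for w in _sample if w not in stop_words])
# -> ['Cafe', 'Ritz', '160', 'E', 'Pearson', 'St']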
tableADf.head(5)
tableBDf.head(5)

# Empty frame previewing the attribute schema; the actual cleaning is done by
# createCleanMatrix below, so this frame is never filled in.
cleanTableADf = pd.DataFrame(columns=('name','addr','city','phone','type','class'))
cleanTableADf
def createCleanMatrix(df):
    """Tokenize and stopword-filter each attribute of every record.

    The 'class' attribute (the record id used for matching) is kept as-is,
    wrapped in a single-element list.
    """
    cleanMatrix = []
    for _, row in df.iterrows():
        cleanRecord = []
        for column in ('name', 'addr', 'city', 'phone', 'type'):
            # str() guards against non-string cells (e.g. numbers or NaN).
            tokens = tokenizer.tokenize(str(row[column]))
            cleanRecord.append([word for word in tokens if word not in stop_words])
        cleanRecord.append([row['class']])
        cleanMatrix.append(cleanRecord)
    # dtype=object is required because the token lists have varying lengths.
    return np.array(cleanMatrix, dtype=object)
cleanMatrixA = createCleanMatrix(tableADf)
cleanMatrixB = createCleanMatrix(tableBDf)
cleanMatrixB.shape
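# Quick structural peek (not in the original notebook): each cleaned record is
# six token lists, one per attribute, with 'class' kept as a single item.
print(cleanMatrixA[0])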
"""Creazione del vettore degli embedding"""
def createDistributedRepresentation(cleanMatrix):
    """Map each record to the concatenation of its attribute embeddings.

    Each attribute is embedded as the mean of the GloVe vectors of its
    tokens, so a record becomes a 6 * 300 = 1800-dimensional vector.
    """
    embeddingMap = {}
    for i, record in enumerate(cleanMatrix):
        tupleEmbedding = np.array([])
        for attribute in record:
            ntokens = 0
            numeratoreVec = np.zeros(300)  # running sum of token embeddings
            for token in attribute:
                ntokens += 1
                embeddingVector = wordToEmbeddingMap.get(token)
                if embeddingVector is not None:
                    numeratoreVec += embeddingVector
            # Guard against empty attributes to avoid dividing by zero. Note
            # that the mean is taken over all tokens, including those without
            # a GloVe entry (which contribute a zero vector).
            attributeEmbedding = numeratoreVec / ntokens if ntokens else numeratoreVec
            tupleEmbedding = np.append(tupleEmbedding, attributeEmbedding)
        embeddingMap[i] = tupleEmbedding
    return embeddingMap
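# A minimal worked example of the averaging scheme above (not in the original
# notebook), using a toy two-dimensional embedding map: the attribute
# ['good', 'cafe'] is embedded as the mean of its token vectors.
_toyMap = {'good': np.array([1.0, 0.0]), 'cafe': np.array([0.0, 1.0])}
_toyAttr = ['good', 'cafe']
_toySum = sum(_toyMap[t] for t in _toyAttr)
print(_toySum / len(_toyAttr))  # -> [0.5 0.5]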
embeddingMapTableA = createDistributedRepresentation(cleanMatrixA)
embeddingMapTableB = createDistributedRepresentation(cleanMatrixB)
from sklearn.metrics.pairwise import cosine_similarity

# Example: cosine similarity between record 159 of table A and record 56 of
# table B (cosine_similarity expects 2-D inputs, hence the reshape).
embeddingA159 = embeddingMapTableA[159].reshape(1, -1)
embeddingB56 = embeddingMapTableB[56].reshape(1, -1)
cosineSim = cosine_similarity(embeddingA159, embeddingB56)
cosineSim
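# Sketch of the next step this script is named for (not in the original
# notebook; the threshold value and the output format are assumptions): score
# every A-B record pair by cosine similarity and keep the high-similarity
# pairs as candidate fake positive samples.
SIM_THRESHOLD = 0.9  # hypothetical cutoff
allA = np.vstack([embeddingMapTableA[k] for k in sorted(embeddingMapTableA)])
allB = np.vstack([embeddingMapTableB[k] for k in sorted(embeddingMapTableB)])
simMatrix = cosine_similarity(allA, allB)  # shape (len(A), len(B))
candidatePairs = np.argwhere(simMatrix > SIM_THRESHOLD)
print(len(candidatePairs), 'candidate matching pairs')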