Source code for NXTfusion.DataMatrix


import pickle as cPickle
import numpy as np
from multipledispatch import dispatch
import time
import NXTfusion.NXTfusion as NX #import Entity
from scipy.sparse import coo_matrix

class SideInfo(object):
    """
    Class that encapsulates the side information raw data in order to be
    efficiently processed by NXTfusion.

    You can use this class to wrap side information vectors analogously to
    how DataMatrix wraps matrix/relations.

    Internally the vectors are stored in ``self.data`` as
    ``{ent1[obj]: feature_vector}``, i.e. keyed by the value that ``ent1``
    returns when indexed with each object it yields on iteration.
    """

    @dispatch(str, NX.Entity, dict)
    def __init__(self, name, ent1, data):
        """
        One of the alternative constructors for the SideInfo class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the object on the dimension 0
        data: dict
            Dict containing ent1 objects as keys and feature vectors
            (side information) as values.

        Returns
        -------
        """
        self.name = name
        self.ent1 = ent1
        # NOTE(review): kept for backward compatibility; this stores the
        # builtin ``type`` because the constructor receives no dtype.
        self.dtype = type
        # dict views are not subscriptable on Python 3, so the first vector
        # is taken through an iterator. Its length sizes the zero vectors
        # used for objects that have no entry in ``data``.
        first = next(iter(data.values()), None)
        length = 0 if first is None else len(first)
        self._fill(data, length)
        stored = list(self.data.values())
        assert all(len(vec) == len(stored[0]) for vec in stored), \
            "ERROR: side information vectors have different lengths."

    @dispatch(str, NX.Entity, np.ndarray)
    def __init__(self, name, ent1, data):
        """
        One of the alternative constructors for the SideInfo class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the object on the dimension 0
        data: numpy.ndarray
            Numpy array that contains the side information. It has shape
            (ent1 obj, feature length), similarly to a scikit-learn feature
            vector.

        Returns
        -------
        """
        self.name = name
        self.ent1 = ent1
        # NOTE(review): kept for backward compatibility (see dict constructor).
        self.dtype = type
        assert len(ent1) == data.shape[0], "ERROR: data.shape[0] and len(ent1) do not match. You should provide one side info vector for each object in the entity."
        self._fill(data, data.shape[1])

    @dispatch(str, NX.Entity, coo_matrix)
    def __init__(self, name, ent1, data):
        """
        One of the alternative constructors for the SideInfo class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the object on the dimension 0
        data: scipy.sparse.coo_matrix
            Scipy coo_matrix that contains the side information. It has
            shape (ent1 obj, feature length), similarly to a scikit-learn
            feature vector. It can be sparse, but currently the sparsity
            during mini batching is NOT supported.

        Returns
        -------
        """
        self.name = name
        self.ent1 = ent1
        # NOTE(review): kept for backward compatibility (see dict constructor).
        self.dtype = type
        length = data.shape[1]
        # The matrix is densified and exposed as {row index: feature row},
        # so every row (not every column) becomes one side-info vector.
        rows = dict(enumerate(data.toarray()))
        self._fill(rows, length)

    @dispatch(str)
    def __init__(self, path):
        """
        This constructor reads a serialized (SideInfo.dump()) SideInfo object.

        Parameters
        ----------
        path: str
            Path to the serialized SideInfo object.

        Returns
        -------
        """
        print("Loading %s..." % path)
        start = time.time()
        # NOTE(security): unpickling executes arbitrary code; only load
        # files that come from a trusted source.
        # Pickle streams are binary, hence "rb".
        with open(path, "rb") as handle:
            store = cPickle.load(handle)
        stop = time.time()
        try:
            name = store["name"]
            ent1 = store["ent1"]
            data = store["data"]
        except (KeyError, TypeError, IndexError):
            print("ERROR: wrong format, check file content")
            raise SystemExit(1)
        self.name = name
        self.ent1 = ent1
        self.data = data
        print("Done in %.2fs." % (stop - start))

    def _fill(self, data, length):
        """
        Populate ``self.data`` from an indexable source of feature vectors.

        For every object ``obj`` yielded by ``self.ent1`` the vector
        ``data[obj]`` is stored under the key ``self.ent1[obj]``. Objects
        that cannot be looked up in ``data`` receive a zero vector of
        ``length`` elements; their count is printed.

        Parameters
        ----------
        data: dict or numpy.ndarray
            Source indexed with the objects of ``self.ent1``.
        length: int
            Length of the zero vector used for missing objects.

        Returns
        -------
        None
        """
        self.data = {}  # data = {idx:[features]}
        missing = 0
        for obj in self.ent1:
            idx = self.ent1[obj]
            try:
                self.data[idx] = data[obj]
            except (KeyError, IndexError, TypeError):
                self.data[idx] = [0] * length
                missing += 1
        print("Missing: ", missing)

    def normalize(self):
        """
        Method that standardizes the side information with the formula
        x' = (x - mu)/s, where mu is the mean and s is the standard
        deviation, both computed over all the stored feature values.

        After the call every stored vector is a float numpy array. When
        the standard deviation is zero the values are only centered, to
        avoid a division by zero.

        Returns
        -------
        None
        """
        if not self.data:
            return
        values = np.concatenate(
            [np.asarray(vec, dtype=float).ravel() for vec in self.data.values()]
        )
        if values.size == 0:
            return
        print(len(values))
        mu = np.mean(values)
        s = np.std(values)
        print("mu = %f, s= %f" % (mu, s))
        scale = s if s > 0 else 1.0
        for key in list(self.data):
            self.data[key] = (np.asarray(self.data[key], dtype=float) - mu) / scale

    def __len__(self):
        """Return the number of stored side information vectors."""
        return len(self.data)

    def __getitem__(self, x):
        """Return the side information vector stored under key ``x``."""
        return self.data[x]

    def dump(self, path=None):
        """
        Method that serializes the SideInfo storing it at the selected path.

        Parameters
        ----------
        path: str
            Destination path for the serialized file. When omitted the file
            is written to "marshalled/<name>.side.nx".

        Returns
        -------
        None
        """
        if path is None:
            path = "marshalled/" + self.name + ".side.nx"
        print("Storing...")
        start = time.time()
        store = {"name": self.name, "ent1": self.ent1, "data": self.data}
        # Pickle streams are binary, hence "wb".
        with open(path, "wb") as handle:
            cPickle.dump(store, handle)
        stop = time.time()
        print("Done in %.2fs." % (stop - start))
class DataMatrix(object):
    """
    The input "data" format should be: {(ent1, ent2): value} for all the
    observed elements in the matrix.

    The format in which the data is stored in the DataMatrix object is the
    following:
    featsHT = {domain1Name_numeric : [ numpy16_domain2Names_numeric, numpyX_labels ]}
    """

    @dispatch(str, NX.Entity, NX.Entity, dict, type)
    def __init__(self, name: str, ent1: NX.Entity, ent2: NX.Entity, data: dict, dtype: type):
        """One of the alternative constructors for the DataMatrix class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the object on the dimension 0
        ent2: Entity
            Entity object representing the object on the dimension 1
        data: dict {(ent1, ent2): value}
            Hash table containing the (sparse) elements in the matrix
            describing the relation. The input "data" format should be:
            {(ent1, ent2): value} for all the observed elements in the matrix.
        dtype: numpy.dtype
            The smallest possible type that could be used to store the
            elements of the matrix (e.g. np.int16)

        Returns
        -------
        """
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.dtype = dtype
        print("Building features for matrix %s..." % name)
        # buildPytorchFeatsHT is dispatched on np.dtype instances, while this
        # constructor receives a type class (e.g. np.int16): convert it.
        self.data, self.size = buildPytorchFeatsHT(data, ent1, ent2, np.dtype(dtype))
        print("Size: ", self.size)

    @dispatch(str, NX.Entity, NX.Entity, coo_matrix)
    def __init__(self, name: str, ent1: NX.Entity, ent2: NX.Entity, data: coo_matrix):
        """One of the alternative constructors for the DataMatrix class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the object on the dimension 0
        ent2: Entity
            Entity object representing the object on the dimension 1
        data: coo_matrix
            scipy.sparse.coo_matrix containing the sparse elements in the
            matrix describing the relation.

        Returns
        -------
        """
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.dtype = data.dtype
        print("Building features for matrix %s..." % name)
        # The dictionary-of-keys view yields ((row, col), value) pairs, the
        # same layout expected by buildPytorchFeatsHT.
        dok = data.todok()
        entries = {key: value for key, value in dok.items()}
        self.data, self.size = buildPytorchFeatsHT(entries, ent1, ent2, dok.dtype)
        print("Size: ", self.size)

    @dispatch(str, NX.Entity, NX.Entity, np.ndarray)
    def __init__(self, name: str, ent1: NX.Entity, ent2: NX.Entity, data: np.ndarray):
        """
        One of the alternative constructors for the DataMatrix class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the object on the dimension 0
        ent2: Entity
            Entity object representing the object on the dimension 1
        data: numpy.ndarray (matrix)
            Numpy matrix containing the (dense) values describing the
            relation between ent1 and ent2.

        Returns
        -------
        DataMatrix object
        """
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.dtype = data.dtype
        assert len(ent1) == data.shape[0], "ERROR: ent1 len is " + str(len(ent1)) + " but data.shape[0] is " + str(data.shape[0])
        assert len(ent2) == data.shape[1], "ERROR: ent2 len is " + str(len(ent2)) + " but data.shape[1] is " + str(data.shape[1])
        print("Building features for matrix %s..." % name)
        self.data, self.size = buildPytorchFeatsHTfromNumpy(data, self.ent1, self.ent2)
        print("Size: ", self.size)

    @dispatch(str, np.ndarray)
    def __init__(self, name: str, data: np.ndarray, dtype: type = np.int32):
        """Simplest possible constructor for the DataMatrix class. Entities are inferred.

        Parameters
        ----------
        name: str
            Name of the data matrix
        data: numpy.ndarray (matrix)
            Numpy matrix containing the (dense) values describing the
            relation between ent1 and ent2.
        dtype: type
            Type forwarded to the two inferred Entity objects. Dispatch only
            looks at positional arguments, so through this signature it can
            be supplied as a keyword; it defaults to np.int32.

        Returns
        -------
        """
        self._initInferred(name, data, dtype)

    @dispatch(str, np.ndarray, type)
    def __init__(self, name: str, data: np.ndarray, dtype: type):
        """Same as the (name, data) constructor, with ``dtype`` passed positionally.

        Parameters
        ----------
        name: str
            Name of the data matrix
        data: numpy.ndarray (matrix)
            Numpy matrix containing the (dense) values describing the
            relation between ent1 and ent2.
        dtype: type
            Type forwarded to the two inferred Entity objects.

        Returns
        -------
        """
        self._initInferred(name, data, dtype)

    @dispatch(str)
    def __init__(self, path):
        """Constructor that reads the DataMatrix from a previously serialized DataMatrix object.

        Parameters
        ----------
        path: str
            Path of the serialized DataMatrix

        Returns
        -------
        """
        print("Loading %s..." % path)
        start = time.time()
        # NOTE(security): unpickling executes arbitrary code; only load
        # files that come from a trusted source.
        # Pickle streams are binary, hence "rb".
        with open(path, "rb") as handle:
            store = cPickle.load(handle)
        stop = time.time()
        try:
            name = store["name"]
            ent1 = store["ent1"]
            ent2 = store["ent2"]
            data = store["data"]
        except (KeyError, TypeError, IndexError):
            print("ERROR: wrong format, check file content")
            raise SystemExit(1)
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.data = data
        print("Done in %.2fs." % (stop - start))

    def _initInferred(self, name, data, dtype):
        """Build the matrix from a dense array, creating one Entity per axis.

        The entities are named "<name>_0" and "<name>_1" and contain the
        integer positions along the corresponding axis of ``data``.
        """
        self.name = name
        self.ent1 = NX.Entity(self.name + "_0", list(range(data.shape[0])), dtype=dtype)
        self.ent2 = NX.Entity(self.name + "_1", list(range(data.shape[1])), dtype=dtype)
        self.dtype = data.dtype
        print("Building features for matrix %s..." % name)
        self.data, self.size = buildPytorchFeatsHTfromNumpy(data, self.ent1, self.ent2)
        print("Size: ", self.size)

    def size(self):
        """
        Function that returns the size of the relation (number of elements
        in the matrix).

        NOTE(review): the constructors that build the matrix from data also
        assign an integer to the instance attribute ``size``, which shadows
        this method on those instances (read ``obj.size`` there). The method
        is reachable on objects loaded from a serialized file.

        Returns
        -------
        Size of the relation in the DataMatrix object
        """
        size = 0
        for entry in self.data.values():
            size += len(entry[0])
        print("Size: ", size)
        return size

    def standardize(self):
        """
        Method that standardizes the matrix with the formula
        x' = (x - mu)/s, where mu is the mean and s is the standard
        deviation, both computed over all the stored values.

        The result is verified with assertions: the standardized values
        must have mean 0 and standard deviation 1 (tolerance 0.0001).

        Returns
        -------
        None
        """
        values = []
        for entry in self.data.values():
            values += list(entry[1])
        mu = np.mean(values)
        s = np.std(values)
        print("mu = %f, s= %f" % (mu, s))
        standardized = []
        for key in list(self.data):
            indices, labels = self.data[key][0], self.data[key][1]
            res = (labels - mu) / s
            self.data[key] = [indices, res]
            standardized += res.tolist()
        tmu = np.mean(standardized)
        ts = np.std(standardized)
        assert abs(tmu) < 0.0001
        assert abs(ts - 1) < 0.0001

    def toHashTable(self) -> dict:
        """
        Method that returns an hash table (dict) containing the DataMatrix
        data.

        Each key is the sorted tuple of the two values obtained by indexing
        ent1 and ent2 with the stored numeric identifiers.

        Returns
        -------
        dict
        """
        db = {}
        for row, entry in self.data.items():
            cols, labels = entry[0], entry[1]
            assert len(cols) == len(labels)
            for col, label in zip(cols, labels):
                key = tuple(sorted([self.ent1[row], self.ent2[int(col)]]))
                db[key] = label
        print("Found %d entries" % len(db))
        return db

    def dump(self, path=None):
        """
        Method that serializes the DataMatrix storing it at the selected
        path. The file can be read back with the DataMatrix(path)
        constructor.

        Parameters
        ----------
        path: str
            Destination path for the serialized file. When omitted the file
            is written to "marshalled/<name>.nx".

        Returns
        -------
        None
        """
        if path is None:
            path = "marshalled/" + self.name + ".nx"
        print("Storing...")
        start = time.time()
        # "ent2" is always stored: the loading constructor requires it.
        store = {"name": self.name, "ent1": self.ent1, "ent2": self.ent2, "data": self.data}
        # Pickle streams are binary, hence "wb".
        with open(path, "wb") as handle:
            cPickle.dump(store, handle)
        stop = time.time()
        print("Done in %.2fs." % (stop - start))
def _isSelfRelation(domain1, domain2):
    """Report (and print) whether the relation links an entity with itself."""
    if domain1 == domain2:
        print(" *** identified as self relation.")
        return True
    print(" *** identified as asymmetric relation.")
    return False


def _packFeatsHT(rows, idxDtype, valDtype):
    """Convert {row: [cols, values]} python lists into numpy arrays and count the elements."""
    size = 0
    packed = {}
    for row, (cols, values) in rows.items():
        size += len(cols)
        packed[row] = [np.array(cols, dtype=idxDtype), np.array(values, dtype=valDtype)]
    return packed, size


@dispatch(np.ndarray, NX.Entity, NX.Entity)
def buildPytorchFeatsHTfromNumpy(data: np.ndarray, domain1: NX.Entity, domain2: NX.Entity) -> (dict, int):
    """
    This function produces the data structure that is internally used to
    pass training data to the wrapper.fit function. This is now transparent
    to the user.

    Parameters
    ----------
    data: numpy.ndarray
        Numpy matrix containing the matrix that represents the relation
        between ent1 and ent2
    domain1 : Entity
        Entity1
    domain2 : Entity
        Entity2

    Returns
    -------
    Dict internally used to feed the data to the NX.Wrapper object, and the
    total number of stored elements. The format is the following
    featsHT = {domain1Name_numeric : [ numpy16_domain2Names_numeric, numpyX_labels ]}
    """
    rows = {idx: [[], []] for idx in range(len(domain1))}
    selfRelation = _isSelfRelation(domain1, domain2)
    for i in range(len(domain1)):
        for j in range(len(domain2)):
            if selfRelation:
                # In a self relation every element is filed under the
                # smaller of its two indices.
                row, col = min(i, j), max(i, j)
            else:
                row, col = i, j
            rows[row][0].append(col)
            rows[row][1].append(data[i, j])
    return _packFeatsHT(rows, domain1.dtype, data.dtype)


# relDtype may be an np.dtype instance (e.g. array.dtype) or a type class
# (e.g. np.int16): both are accepted by np.array(..., dtype=...).
@dispatch(dict, NX.Entity, NX.Entity, (np.dtype, type))
def buildPytorchFeatsHT(data: dict, domain1: NX.Entity, domain2: NX.Entity, relDtype: type) -> (dict, int):
    """
    This function produces the data structure that is internally used to
    pass training data to the wrapper.fit function. This is now transparent
    to the user.

    Parameters
    ----------
    data: dict {(ent1[i], ent2[j]): value[i,j]}
        Dict in the following format: {(ent1[i], ent2[j]): value[i,j]}
        containing the matrix that represents the relation between ent1 and
        ent2. This format can be used to input sparse matrices
    domain1 : Entity
        Entity1
    domain2 : Entity
        Entity2
    relDtype : numpy.dtype
        The smallest np.dtype sufficient to represent the values in the
        matrix. Both np.dtype instances and type classes are accepted.

    Returns
    -------
    Dict internally used to feed the data to the NX.Wrapper object, and the
    total number of stored elements. The format is the following
    featsHT = {domain1Name_numeric : [ numpy16_domain2Names_numeric, numpyX_labels ]}
    """
    rows = {idx: [[], []] for idx in range(len(domain1))}
    selfRelation = _isSelfRelation(domain1, domain2)
    for key, value in data.items():
        if selfRelation:
            # Best effort: in a self relation, pairs that cannot be resolved
            # or ordered are skipped.
            try:
                row, col = sorted([domain1[key[0]], domain2[key[1]]])
            except Exception:
                continue
        else:
            row, col = domain1[key[0]], domain2[key[1]]
        rows[row][0].append(col)
        rows[row][1].append(value)
    return _packFeatsHT(rows, domain1.dtype, relDtype)