import pickle as cPickle
import numpy as np
from multipledispatch import dispatch
import time
import NXTfusion.NXTfusion as NX #import Entity
from scipy.sparse import coo_matrix
class SideInfo(object):
    """
    Class that encapsulates the side information raw data so that it can be
    processed by NXTfusion. Use it to wrap side information vectors
    analogously to how DataMatrix wraps matrices/relations.

    After construction ``self.data`` is a dict ``{ent1[obj]: feature_vector}``
    with one entry for every object yielded by iterating ``ent1``.
    """

    @dispatch(str, NX.Entity, dict)
    def __init__(self, name, ent1, data):
        """
        One of the alternative constructors for the SideInfo class.

        Parameters
        ----------
        name: str
            Name of the side information
        ent1 : Entity
            Entity object representing the objects on dimension 0
        data: dict
            Dict containing ent1 objects as keys and feature vectors
            (side information) as values.

        Notes
        -----
        Objects of ``ent1`` that are not keys of ``data`` receive an all-zero
        vector whose length equals the length of the first vector in ``data``.
        All stored vectors must end up with the same length.
        """
        self.name = name
        self.ent1 = ent1
        # NOTE(review): this stores the builtin `type`, exactly as the original
        # code did; no dtype is available in this constructor.
        self.dtype = type
        # dict views are not subscriptable in Python 3, so take the first
        # value through an iterator; an empty dict yields zero-length vectors.
        first = next(iter(data.values()), None)
        length = 0 if first is None else len(first)
        self._fill(data, length)
        lengths = {len(vec) for vec in self.data.values()}
        assert len(lengths) <= 1, "ERROR: side information vectors have different lengths."

    @dispatch(str, NX.Entity, np.ndarray)
    def __init__(self, name, ent1, data):
        """
        One of the alternative constructors for the SideInfo class.

        Parameters
        ----------
        name: str
            Name of the side information
        ent1 : Entity
            Entity object representing the objects on dimension 0
        data: numpy.ndarray
            Numpy array that contains the side information. It has shape
            (ent1 obj, feature length), similarly to a scikit-learn feature
            matrix.

        Notes
        -----
        Rows are looked up as ``data[obj]`` for every ``obj`` yielded by
        ``ent1``; objects that cannot index the array get an all-zero vector.
        NOTE(review): this presumably expects entity objects to be the row
        numbers themselves - confirm against NX.Entity.
        """
        self.name = name
        self.ent1 = ent1
        # NOTE(review): stores the builtin `type`, as in the original code.
        self.dtype = type
        assert len(ent1) == data.shape[0], "ERROR: data.shape[0] and len(ent1) do not match. You should provide one side info vector for each object in the entity."
        self._fill(data, data.shape[1])

    @dispatch(str, NX.Entity, coo_matrix)
    def __init__(self, name, ent1, data):
        """
        One of the alternative constructors for the SideInfo class.

        Parameters
        ----------
        name: str
            Name of the side information
        ent1 : Entity
            Entity object representing the objects on dimension 0
        data: scipy.sparse.coo_matrix
            Scipy coo_matrix that contains the side information. It has shape
            (ent1 obj, feature length), similarly to a scikit-learn feature
            matrix. The matrix is densified here: sparsity during mini
            batching is NOT supported.

        Notes
        -----
        The previous implementation could not run (it called ``len`` on an
        int and never advanced its loop counter) and hard-coded a feature
        length of 100. The full feature length ``data.shape[1]`` is now used.
        """
        self.name = name
        self.ent1 = ent1
        # NOTE(review): stores the builtin `type`, as in the original code.
        self.dtype = type
        dense = data.toarray()
        # One dense row per matrix row, keyed by the row number.
        rows = {idx: dense[idx, :] for idx in range(dense.shape[0])}
        self._fill(rows, dense.shape[1])

    @dispatch(str)
    def __init__(self, path):
        """
        This constructor reads a serialized (SideInfo.dump()) SideInfo object.

        Parameters
        ----------
        path: str
            Path to the serialized SideInfo object.

        Notes
        -----
        The file is read with pickle, which can execute arbitrary code: only
        load files coming from a trusted source. If the stored object lacks
        the "name", "ent1" or "data" keys an error is printed and the
        interpreter exits with status 1.
        """
        print( "Loading %s..." % path)
        start = time.time()
        # Pickle streams are binary: the file must be opened with "rb".
        with open(path, "rb") as handle:
            store = cPickle.load(handle)
        stop = time.time()
        try:
            name, ent1, data = store["name"], store["ent1"], store["data"]
        except (KeyError, IndexError, TypeError):
            print( "ERROR: wrong format, check file content")
            raise SystemExit(1)
        self.name = name
        self.ent1 = ent1
        self.data = data
        print ("Done in %.2fs." % (stop-start))

    def _fill(self, lookup, length):
        """
        Build ``self.data`` from ``lookup`` for every object of ``self.ent1``.

        Parameters
        ----------
        lookup: dict or numpy.ndarray
            Container indexed with the objects yielded by ``self.ent1``.
        length: int
            Length of the zero vector stored for objects that cannot be
            found in ``lookup``.
        """
        self.data = {}  # data = {idx:[features]}
        missing = 0
        for obj in self.ent1:
            try:
                vec = lookup[obj]
            except (KeyError, IndexError):
                # Best effort: objects without side information get zeros.
                vec = [0] * length
                missing += 1
            self.data[self.ent1[obj]] = vec
        print ("Missing: ", missing)

    def normalize(self):
        """
        Method that standardizes the side information with the formula
        x' = (x - mu)/s, where mu is the mean and s is the standard deviation
        computed over all the stored feature values.

        Every stored vector is replaced by a float numpy array. If the
        standard deviation is zero the values are only centered, to avoid a
        division by zero. Nothing is done when there are no values.

        Returns
        -------
        None
        """
        if not self.data:
            return
        values = np.concatenate(
            [np.asarray(vec, dtype=float).ravel() for vec in self.data.values()]
        )
        if values.size == 0:
            return
        print( values.size)
        mu = values.mean()
        s = values.std()
        print ("mu = %f, s= %f" % (mu, s))
        scale = s if s > 0 else 1.0
        for key, vec in list(self.data.items()):
            self.data[key] = (np.asarray(vec, dtype=float) - mu) / scale

    def __len__(self):
        """Return the number of stored side information vectors."""
        return len(self.data)

    def __getitem__(self, x):
        """Return the side information vector stored under key ``x``."""
        return self.data[x]

    def dump(self, path=None):
        """
        Method that serializes the SideInfo storing it at the selected path.

        Parameters
        ----------
        path: str
            Destination path for the serialized file. When None the file is
            written to "marshalled/<name>.side.nx".

        Returns
        -------
        None
        """
        if path is None:
            path = "marshalled/"+self.name+".side.nx"
        print ("Storing...")
        start = time.time()
        store = {"name":self.name, "ent1":self.ent1, "data":self.data}
        # Pickle streams are binary: the file must be opened with "wb".
        with open(path, "wb") as handle:
            cPickle.dump(store, handle)
        stop = time.time()
        print ("Done in %.2fs." % (stop-start))
class DataMatrix(object):
    """
    Wrapper around a matrix describing the relation between two entities.

    The input "data" format should be: {(ent1, ent2): value} for all the observed elements in the matrix.
    The format in which the data is stored in the DataMatrix object is the following:
    featsHT = {domain1Name_numeric : [ numpy16_domain2Names_numeric, numpyX_labels ]}

    ``self.size`` is an int attribute holding the number of stored elements.
    Because it is set on every instance it shadows the ``size`` method, which
    therefore can only be reached as ``DataMatrix.size(obj)``.
    """

    @dispatch(str, NX.Entity, NX.Entity, dict, (type, np.dtype))
    def __init__(self, name:str, ent1:NX.Entity, ent2:NX.Entity, data:dict, dtype:type) :
        """One of the alternative constructors for the DataMatrix class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the objects on dimension 0
        ent2: Entity
            Entity object representing the objects on dimension 1
        data: dict {(ent1, ent2): value}
            Hash table containing the (sparse) elements of the matrix describing the relation. The input "data" format should be: {(ent1, ent2): value} for all the observed elements in the matrix.
        dtype: numpy.dtype
            The smallest possible type that could be used to store the elements of the matrix (e.g. np.int16). Both scalar types and numpy.dtype instances are accepted.
        """
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.dtype = dtype
        print( "Building features for matrix %s..." % name)
        # buildPytorchFeatsHT dispatches on numpy.dtype instances, so scalar
        # types such as np.int16 are normalised first.
        self.data, self.size = buildPytorchFeatsHT(data, ent1, ent2, np.dtype(dtype))
        print ("Size: ", self.size)

    @dispatch(str, NX.Entity, NX.Entity, coo_matrix)
    def __init__(self, name:str, ent1:NX.Entity, ent2:NX.Entity, data:coo_matrix) :
        """One of the alternative constructors for the DataMatrix class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the objects on dimension 0
        ent2: Entity
            Entity object representing the objects on dimension 1
        data: coo_matrix
            scipy.sparse.coo_matrix containing the sparse elements of the matrix describing the relation.
        """
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.dtype = data.dtype
        print( "Building features for matrix %s..." % name)
        data = data.todok()
        # Copy the stored (row, col) -> value pairs into a plain dict.
        d = {key: value for key, value in data.items()}
        self.data, self.size = buildPytorchFeatsHT(d, ent1, ent2, data.dtype)
        print ("Size: ", self.size)

    @dispatch(str, NX.Entity, NX.Entity, np.ndarray)
    def __init__(self, name:str, ent1:NX.Entity, ent2:NX.Entity, data:np.ndarray) :
        """
        One of the alternative constructors for the DataMatrix class.

        Parameters
        ----------
        name: str
            Name of the data matrix
        ent1 : Entity
            Entity object representing the objects on dimension 0
        ent2: Entity
            Entity object representing the objects on dimension 1
        data: numpy.ndarray (matrix)
            Numpy matrix containing the (dense) relation between ent1 and ent2. Its shape must be (len(ent1), len(ent2)).
        """
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.dtype = data.dtype
        assert len(ent1) == data.shape[0], "ERROR: ent1 len is "+str(len(ent1))+" but data.shape[0] is"+str(data.shape[0])
        assert len(ent2) == data.shape[1], "ERROR: ent2 len is "+str(len(ent2))+" but data.shape[1] is"+str(data.shape[1])
        print( "Building features for matrix %s..." % name)
        self.data, self.size = buildPytorchFeatsHTfromNumpy(data, self.ent1, self.ent2)
        print ("Size: ", self.size)

    @dispatch(str, np.ndarray)
    def __init__(self, name:str, data:np.ndarray, dtype:type=np.int32) :
        """Simplest possible constructor for the DataMatrix class. Entities are inferred.

        Parameters
        ----------
        name: str
            Name of the data matrix
        data: numpy.ndarray (matrix)
            Numpy matrix containing the (dense) relation between ent1 and ent2.
        dtype: numpy.dtype
            dtype handed to the two inferred NX.Entity objects. This
            constructor is dispatched on two positional arguments, so
            ``dtype`` must be passed by keyword.
            NOTE(review): the np.int32 default is a choice made here; the
            previous signature required ``dtype`` and was unreachable.
        """
        self.name = name
        self.dtype = data.dtype
        # The inferred entities are named "<name>_0"/"<name>_1" and contain
        # the row/column numbers of the matrix.
        self.ent1 = NX.Entity(self.name+"_0", list(range(data.shape[0])), dtype = dtype)
        self.ent2 = NX.Entity(self.name+"_1", list(range(data.shape[1])), dtype = dtype)
        print( "Building features for matrix %s..." % name)
        self.data, self.size = buildPytorchFeatsHTfromNumpy(data, self.ent1, self.ent2)
        print ("Size: ", self.size)

    def _countEntries(self):
        """Return the number of (index, value) pairs stored in self.data."""
        return sum(len(entry[0]) for entry in self.data.values())

    def size(self):
        """
        Function that returns the size of the relation (number of elements in the matrix).

        Instances set an int attribute with the same name, so this function
        is only reachable as ``DataMatrix.size(obj)``.

        Returns
        -------
        Size of the relation in the DataMatrix object
        """
        size = self._countEntries()
        print ("Size: ", size)
        return size

    @dispatch(str)
    def __init__(self, path):
        """Constructor that reads the DataMatrix from a previously serialized DataMatrix object.

        Parameters
        ----------
        path: str
            Path of the serialized DataMatrix

        Notes
        -----
        The file is read with pickle, which can execute arbitrary code: only
        load files coming from a trusted source. If the stored object lacks
        the "name", "ent1", "ent2" or "data" keys an error is printed and the
        interpreter exits with status 1.
        """
        print( "Loading %s..." % path)
        start = time.time()
        # Pickle streams are binary: the file must be opened with "rb".
        with open(path, "rb") as handle:
            store = cPickle.load(handle)
        stop = time.time()
        try:
            name, ent1 = store["name"], store["ent1"]
            ent2, data = store["ent2"], store["data"]
        except (KeyError, IndexError, TypeError):
            print ("ERROR: wrong format, check file content")
            raise SystemExit(1)
        self.name = name
        self.ent1 = ent1
        self.ent2 = ent2
        self.data = data
        # Keep `size` an int attribute, as the other constructors do.
        self.size = self._countEntries()
        print ("Done in %.2fs." % (stop-start))

    def standardize(self):
        """
        Method that standardizes the matrix with the formula x' = (x - mu)/s, where mu is the mean and s is the standard deviation.

        The values of every row are replaced in place; the method then
        asserts that the result has mean 0 and standard deviation 1 (within
        0.0001).

        Returns
        -------
        None
        """
        values = []
        for entry in self.data.values():
            values += list(entry[1])
        mu = np.mean(values)
        s = np.std(values)
        print ("mu = %f, s= %f" % (mu, s))
        standardized = []
        for key, entry in list(self.data.items()):
            res = (entry[1]-mu)/s
            self.data[key] = [entry[0], res]
            standardized += res.tolist()
        # Sanity check on the transformed values.
        assert abs(np.mean(standardized)) < 0.0001
        assert abs(np.std(standardized)-1) < 0.0001

    def toHashTable(self)-> dict:
        """
        Method that returns a hash table (dict) containing the DataMatrix data.

        Each key is the sorted tuple of ``self.ent1[row]`` and
        ``self.ent2[col]``; the value is the matrix element.

        Returns
        -------
        dict
        """
        db = {}
        for row, entry in self.data.items():
            cols, vals = entry[0], entry[1]
            assert len(cols) == len(vals)
            for col, val in zip(cols, vals):
                db[tuple(sorted([self.ent1[row], self.ent2[int(col)]]))] = val
        print( "Found %d entries" % len(db))
        return db

    def dump(self, path= None):
        """
        Method that serializes the DataMatrix storing it at the selected path.

        Parameters
        ----------
        path: str
            Destination path for the serialized file. When None the file is
            written to "marshalled/<name>.nx".

        Returns
        -------
        None
        """
        if path is None:
            path = "marshalled/"+self.name+".nx"
        print ("Storing...")
        start = time.time()
        # "ent2" is always stored: the path constructor requires it.
        store = {"name":self.name, "ent1":self.ent1, "ent2":self.ent2, "data":self.data}
        # Pickle streams are binary: the file must be opened with "wb".
        with open(path, "wb") as handle:
            cPickle.dump(store, handle)
        stop = time.time()
        print ("Done in %.2fs." % (stop-start))
@dispatch(np.ndarray, NX.Entity, NX.Entity)
def buildPytorchFeatsHTfromNumpy(data:np.ndarray, domain1:NX.Entity, domain2:NX.Entity) -> (dict, int):
    """ Build, from a dense matrix, the data structure that is internally used to pass training data to the wrapper.fit function.

    Parameters
    ----------
    data: numpy.ndarray
        Numpy matrix representing the relation between domain1 and domain2.
        It is indexed as data[i, j] with i < len(domain1) and j < len(domain2).
    domain1 : Entity
        Entity on dimension 0
    domain2 : Entity
        Entity on dimension 1

    Returns
    -------
    (dict, int)
        The dict has the following format
        featsHT = {domain1Name_numeric : [ numpy16_domain2Names_numeric, numpyX_labels ]}
        and the int is the total number of stored elements.

    Notes
    -----
    When ``domain1 == domain2`` the relation is treated as a self relation:
    each element (i, j) is filed under row min(i, j) with index max(i, j), so
    both data[i, j] and data[j, i] end up in the same row.
    The index arrays use ``domain1.dtype`` and the value arrays ``data.dtype``.
    """
    nRows = len(domain1)
    nCols = len(domain2)
    # One (indices, values) pair of accumulators per row of domain1.
    buckets = {row: ([], []) for row in range(nRows)}
    selfRelation = bool(domain1 == domain2)
    if selfRelation:
        print( " *** identified as self relation.")
    else:
        print (" *** identified as asymmetric relation.")
    for i in range(nRows):
        for j in range(nCols):
            if selfRelation:
                row, col = min(i, j), max(i, j)
            else:
                row, col = i, j
            indices, values = buckets[row]
            indices.append(col)
            values.append(data[i,j])
    size = 0
    xf = {}
    for row, (indices, values) in buckets.items():
        size += len(indices)
        xf[row] = [np.array(indices, dtype=domain1.dtype), np.array(values, dtype=data.dtype)]
    return xf, size
@dispatch(dict, NX.Entity, NX.Entity, (np.dtype, type))
def buildPytorchFeatsHT(data:dict, domain1:NX.Entity, domain2:NX.Entity, relDtype:type)-> (dict, int):
    """ Build, from a hash table, the data structure that is internally used to pass training data to the wrapper.fit function.

    Parameters
    ----------
    data: dict {(ent1[i], ent2[j]): value[i,j]}
        Dict in the following format: {(ent1[i], ent2[j]): value[i,j]} containing the matrix that represents the relation between domain1 and domain2. This format can be used to input sparse matrices.
    domain1 : Entity
        Entity on dimension 0
    domain2 : Entity
        Entity on dimension 1
    relDtype : numpy.dtype
        The smallest np.dtype sufficient to represent the values in the matrix. Both numpy.dtype instances and scalar types (e.g. np.int16) are accepted.

    Returns
    -------
    (dict, int)
        The dict has the following format
        featsHT = {domain1Name_numeric : [ numpy16_domain2Names_numeric, numpyX_labels ]}
        and the int is the total number of stored elements.

    Notes
    -----
    When ``domain1 == domain2`` the relation is treated as a self relation:
    the two looked-up indices are sorted and the element is filed under the
    smaller one. In that mode, entries whose lookup or sorting fails are
    skipped (best effort); in the asymmetric mode a failing lookup raises.
    The index arrays use ``domain1.dtype``.
    """
    # One [indices, values] pair of accumulators per element of domain1.
    x = {idx: [[], []] for idx in range(len(domain1))}
    if domain1 == domain2:
        print( " *** identified as self relation.")
        selfRelation = True
    else:
        print (" *** identified as asymmetric relation.")
        selfRelation = False
    skipped = 0
    for key, value in data.items():
        if selfRelation:
            try:
                row, col = sorted([domain1[key[0]], domain2[key[1]]])
            except (KeyError, IndexError, ValueError, TypeError):
                # Best effort, as in the original code: ignore entries that
                # cannot be mapped, but no longer swallow KeyboardInterrupt
                # or SystemExit.
                skipped += 1
                continue
        else:
            row, col = domain1[key[0]], domain2[key[1]]
        x[row][0].append(col)
        x[row][1].append(value)
    if skipped:
        print (" *** skipped %d entries that could not be mapped." % skipped)
    size = 0
    xf = {}
    for row, (indices, values) in x.items():
        size += len(indices)
        xf[row] = [np.array(indices, dtype=domain1.dtype), np.array(values, dtype=relDtype)]
    return xf, size