# Source code for NXTfusion.NXDatasetUtils

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  pytorchDatasetUtils.py
#  
#  Copyright 2018 Daniele Raimondi <daniele.raimondi@vub.be>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#  
#  
import pickle as cPickle
import time
import torch as t
import numpy as np
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

class PredictionDatasetSide(Dataset):
	"""
	Dataset of (i, j) index pairs, each optionally paired with side information.

	:meta private:
	"""
	def __init__(self, x, sx1, sx2):
		"""
		Store the index pairs and the optional side information containers.

		Parameters
		----------
		x : sequence of pairs
			Index pairs, e.g. [(i,j), (i,k), ...].
		sx1 : indexable or None
			Side information looked up with the first element of each pair. None disables it.
		sx2 : indexable or None
			Side information looked up with the second element of each pair. None disables it.

		Returns
		-------

		"""
		self.x = x # [(i,j), (i,k), ...]
		self.sx1 = sx1
		self.sx2 = sx2

	def __getitem__(self, idx):
		"""
		Return ``(i, j, side1, side2)`` for the pair stored at position ``idx``.

		A side entry is an empty list when the corresponding container is None.

		:meta private:
		"""
		pair = self.x[idx]
		# Identity test on purpose: "!= None" is evaluated element-wise on numpy
		# arrays, which makes the "if" raise on array-valued side information.
		side1 = self.sx1[pair[0]] if self.sx1 is not None else []
		side2 = self.sx2[pair[1]] if self.sx2 is not None else []
		return pair[0], pair[1], side1, side2

	def __len__(self):
		"""Number of stored index pairs."""
		return len(self.x)


class PredictionDataset(Dataset):
	"""
	Dataset wrapping a sequence of (i, j) index pairs.
	"""

	def __init__(self, x, label = True):
		"""
		Store the index pairs.

		Parameters
		----------
		x : sequence of pairs
			Index pairs, e.g. [(i,j), (i,k), ...].
		label : bool
			Accepted for compatibility; this class does not use it.

		Returns
		-------

		"""
		self.x = x # [(i,j), (i,k), ...]

	def __getitem__(self, idx):
		"""Return the two elements of the pair stored at position ``idx``."""
		pair = self.x[idx]
		return pair[0], pair[1]

	def __len__(self):
		"""Number of stored index pairs."""
		return len(self.x)

class SideDataset(Dataset):
	"""
	Dataset exposing a dict of side information vectors, indexed by the dict keys.
	"""
	def __init__(self, side):
		"""
		Store the side information dict and estimate its total size.

		Parameters
		----------
		side : dict
			Maps each key to its side information vector.

		Returns
		-------

		"""
		# The previous check compared the type against an empty dict instance
		# ("type(side) == {}"), which is never true.
		assert isinstance(side, dict)
		self.side = side
		# Estimated size = number of entries times the length of the first vector;
		# an empty dict has size 0.
		firstVector = next(iter(side.values()), None)
		self.estSize = len(side) * len(firstVector) if firstVector is not None else 0

	def __getitem__(self, idx):
		"""Return the side information stored under key ``idx``."""
		return self.side[idx]

	def __len__(self):
		"""Number of entries in the side information dict."""
		return len(self.side)

class SubDataset(Dataset):
	"""
	Within the NNwrapper, during training, batches need to be rapidly provided for all the MetaRelations in the ERgraph and for each Relation in every MetaRelation. To do so, the NNwrapper.processDatasets function builds an internal Dataset structure that mimics the structure of the input ERgraph. In this case, MetaDataset corresponds to MetaRelation, and each Relation in a MetaRelation is represented by a SubDataset in the corresponding MetaDataset.

	Nevertheless, this is internal and it is transparent to the user.

	:meta private:
	"""
	def __init__(self, xht, typep="binary"):
		"""
		Constructor method for the SubDataset class. It puts in a pytorch-friendly structure the matrix corresponding to a target Relation, by transforming its DataMatrix into a pytorch Dataset.

		Parameters
		----------
		xht : dict
			Dict used to represent the matrix/relation data within a DataMatrix object. Each value is a pair [(positions), (values)].
		typep : str
			String specifying the type of the prediction. It must be "regression" or "binary".

		Returns
		-------

		"""
		assert isinstance(xht, dict)
		self.xht = xht #xht = {p1:[(positions),(values)]}
		# Rows whose position list is empty are only reported, not removed.
		empty = sum(1 for row in self.xht.values() if len(row[0]) == 0)
		print ("Empty rows: ",empty)
		self.estSize = self.countInstances()
		# countBalance reads self.type, so the type must be set first.
		self.type = typep
		self.balance = self.countBalance()

	def countBalance(self):
		"""
		Count the labels of a binary relation.

		Returns
		-------
		list or str
			[sum of the stored values, number of stored values minus that sum] when the type is "binary"; the string "regression" for any other type.
		"""
		if self.type != "binary":
			return "regression"
		positives = 0
		negatives = 0
		for row in self.xht.values():
			rowSum = sum(row[1])
			positives += rowSum
			negatives += len(row[1]) - rowSum
		return [positives, negatives]

	def countInstances(self):
		"""Return the total number of positions stored over all the rows."""
		return sum(len(row[0]) for row in self.xht.values())

	def __getitem__(self, idx):
		"""Return the [(positions), (values)] entry stored under key ``idx``."""
		return self.xht[idx]

	@staticmethod
	def load(name):
		"""
		Build a SubDataset from a dict previously stored with dump().

		The prediction type is taken from the file name: "binary" if the name contains that word, otherwise "regression" if it contains that word, otherwise None.

		Parameters
		----------
		name : str
			Path of the pickle file to read.

		Returns
		-------
		SubDataset
		"""
		# NOTE: unpickling can execute arbitrary code; only load trusted files.
		# Pickle data is binary, so the file must be opened in "rb" mode.
		with open(name, "rb") as handle:
			xht = cPickle.load(handle)
		vt = None
		if "binary" in name:
			vt = "binary"
		elif "regression" in name:
			vt = "regression"
		return SubDataset(xht, typep=vt)

	def dump(self, name):
		"""
		Pickle the underlying dict to the file ``name`` and print the elapsed time.

		Parameters
		----------
		name : str
			Path of the pickle file to write.
		"""
		print( "Dumping...")
		t1 = time.time()
		# Pickle data is binary, so the file must be opened in "wb" mode.
		with open(name, "wb") as handle:
			cPickle.dump(self.xht, handle)
		t2 = time.time()
		print ("Stored in: %s (%.2fs)" % ( name, t2-t1))

	def __len__(self):
		"""Number of rows (keys) in the underlying dict."""
		return len(self.xht)

class MetaDataset(Dataset):
	"""
	Class that represents the MetaRelations in the NNwrapper internal Dataset-based version of the ERgraph used for allowing a fast and consistent multi-task mini batching. Each MetaDataset can contain many SubDatasets, and when asked it provides a minibatch sampling from all of them in parallel.
	"""
	# NOTE (translated from the original Italian remark): an attempt was made to
	# save space with numpy, but odd data reaches the collate step; to be fixed.

	def __init__(self, datasetList, domain1, domain2, name, ignore_index, side1 = None, side2 = None):
		"""
		Constructor method for the MetaDataset class. It puts in a pytorch-friendly structure the data corresponding to a target MetaRelation, by storing several SubDataset (each corresponding to a Relation/DataMatrix/matrix).

		Parameters
		----------
		datasetList : list of SubDatasets
			List of Subdatasets. Each SubDataset corresponds to a Relation. The MetaDataset thus corresponds to a MetaRelation.
		domain1 : NX.Entity
			First entity involved in this MetaRelation (all the Relations in it are between the same entities)
		domain2 : NX.Entity
			Second entity involved in this list of relations (MetaRelation).
		name: str
			Name of the corresponding MetaRelation
		ignore_index: int
			Value corresponding to missing values. Used to allow fast runs on GPUs and minibatching even with different percentages of missing values among the Relations/SubDatasets in the same MetaRelation/MetaDataset.
		side1 : indexable or None
			Optional side information, looked up with the index of the domain1 object.
		side2 : indexable or None
			Optional side information, looked up with the positions of the domain2 objects.

		Returns
		-------

		"""
		self.name = name
		self.ignore_index = ignore_index
		self.side1 = side1
		self.side2 = side2
		self.datasetList = datasetList
		self.domain1 = domain1
		self.domain2 = domain2

	def getTypes(self):
		"""Return the list with the ``type`` of every SubDataset."""
		return [ds.type for ds in self.datasetList]

	def countBalance(self):
		"""Return the list with the ``balance`` of every SubDataset."""
		return [ds.balance for ds in self.datasetList]

	def countInstances(self):
		"""Return the list with the ``estSize`` of every SubDataset."""
		return [ds.estSize for ds in self.datasetList]

	def getEstSize(self):
		"""Return the sum of the estimated sizes of all the SubDatasets."""
		return sum(self.countInstances())

	def getEstBatchSizeForXsamples(self, targetDomain1, samplesPerBatch):
		"""
		Scale a batch size chosen for ``targetDomain1`` to the size of this dataset's domain1.

		Parameters
		----------
		targetDomain1 : sized
			Reference domain on which ``samplesPerBatch`` was chosen.
		samplesPerBatch : int
			Batch size on the reference domain. Must be positive.

		Returns
		-------
		int
			Batch size for this dataset, at least 1.
		"""
		assert samplesPerBatch > 0
		perc = samplesPerBatch / float(len(targetDomain1))
		res = int(max(1, perc * float(len(self.domain1))))
		print (" Foreseen batch size: ", res)
		return res

	def getEstBatchSizeForXsamples2(self, numSamples): #future daniele, pay attention to this, may be working bady
		"""
		Estimate the batch size that splits the dataset into about ``numSamples`` batches.

		Parameters
		----------
		numSamples : int
			Must be greater than 10.

		Returns
		-------
		int
			Estimated batch size, at least 1.
		"""
		assert numSamples > 10
		# Floor division keeps the result an integer batch size; true division
		# would hand a float to the caller.
		tot = self.getEstSize() // len(self.domain2)
		res = int(max(1, tot // numSamples))
		print( " Foreseen batch size: ", res)
		return res

	def __getitem__(self, idx):
		"""Collect entry ``idx`` from every SubDataset and merge them with mergeDataSimple."""
		perRelation = [d[idx] for d in self.datasetList]
		return self.mergeDataSimple(perRelation, idx)

	def mergeDataSimple(self, v, idx):
		"""
		Merge the entries of all the SubDatasets for the domain1 object ``idx``.

		Parameters
		----------
		v : list
			One (positions, values) pair per SubDataset, in datasetList order.
		idx : int
			Index of the domain1 object the entries belong to.

		Returns
		-------
		tuple
			(x1, x2, y) when no side information is set, else (x1, x2, y, xside1, xside2). x1 repeats ``idx`` once per position, x2 concatenates the positions and y has one column per SubDataset, filled with ``ignore_index`` except in the column of the SubDataset the row comes from. xside1 and xside2 are stacked side information rows, or an empty list when there is none.
		"""
		x1 = []
		x2 = []
		y = []
		xside1 = []
		xside2 = []
		numRelations = len(v)
		# Translated from the original Italian remark: the code below still needs tidying up.
		for count, ds in enumerate(v):
			assert len(ds[0]) == len(ds[1]) and len(ds) == 2
			numPositions = len(ds[0])
			x1.append(np.array([idx]*numPositions, dtype=np.int32)) #BEWARE! you might have more instances TODO
			x2.append(ds[0])
			# Only the column of the current relation holds real targets.
			targets = np.ones((numPositions, numRelations))* self.ignore_index
			targets[:,count] = ds[1]
			y.append(targets)

			# Identity tests on purpose: "!= None" is evaluated element-wise on
			# numpy arrays, which makes the "if" raise.
			if self.side1 is not None:
				xside1.extend([self.side1[idx]] * numPositions)
			if self.side2 is not None:
				xside2.extend(self.side2[s] for s in ds[0])

		xside1 = np.vstack(xside1) if len(xside1) > 0 else []
		xside2 = np.vstack(xside2) if len(xside2) > 0 else []
		if self.side1 is not None or self.side2 is not None:
			return np.hstack(x1), np.hstack(x2), np.vstack(y), xside1, xside2
		return np.hstack(x1), np.hstack(x2), np.vstack(y)

	def __len__(self):
		"""Number of objects in domain1."""
		return len(self.domain1)