import numpy as N
import re, pylab, csv

def convert_search(col,missing):
	"""
	Convert a column of strings to the most specific type that accepts it.

	The conversions are tried in this order: integer, float, date-number
	(``pylab.datestr2num``).  If none of them accepts every value, the column
	is returned as strings.

	Parameters
	----------
	col : array of strings
		One column of the raw table.
	missing : bool
		When true, cells equal to '' or '.' are replaced by 'nan' before the
		conversion is attempted.  'nan' is not a valid integer, so a numeric
		column containing missing cells comes back as float.

	Returns
	-------
	A new array.  ``col`` itself is never modified.
	"""

	col = N.asarray(col)

	# convert missing values to nan
	if missing and col.dtype.kind in ('S', 'U'):
		is_missing = (col == '') | (col == '.')
		if is_missing.any():
			# N.where builds a new array whose item size is wide enough for
			# 'nan'.  Assigning in place would truncate it to 'n' or 'na' when
			# the column's string width is below three, and would also write
			# through to the caller's table.
			col = N.where(is_missing, 'nan', col)

	# find out if the data column is an int, float, date, or string
	try:
		return col.astype('int')						# converting to integer
	except (ValueError, OverflowError):
		# OverflowError: some numpy versions raise it for digit strings that
		# do not fit the integer type; the float attempt below can still work
		pass

	try:
		return col.astype('float')						# converting to float
	except ValueError:
		pass

	try:
		return pylab.datestr2num(col)					# converting the dates column to a date-number
	except ValueError:
		# plain strings: turn the 'nan' placeholders back into empty strings
		return N.where(col == 'nan', '', col)

def load_search(reader, missing, varnm, cols):
	"""
	Read every row from ``reader`` and build a record array whose column
	types are detected by ``convert_search``.

	Parameters
	----------
	reader : iterable of rows, each row a list of strings (e.g. a csv.reader)
	missing : bool
		Passed on to ``convert_search``.
	varnm : list of str
		Field names; the first ``cols`` entries are used.
	cols : int
		Number of columns every row must have.

	Returns
	-------
	numpy record array with one field per column.

	Raises
	------
	ValueError
		If there are no rows, if fewer than ``cols`` names were given, or if
		a row does not have exactly ``cols`` cells.
	"""

	# putting the data in a list first so its shape can be checked
	rows = list(reader)
	if not rows:
		raise ValueError("no data rows to load")
	if len(varnm) < cols:
		raise ValueError("%d variable names given for %d columns" % (len(varnm), cols))

	# a ragged table cannot be transposed into columns, so fail early and
	# say which row is to blame
	for nr, row in enumerate(rows):
		if len(row) != cols:
			raise ValueError("data row %d has %d columns, expected %d" % (nr + 1, len(row), cols))

	# putting the data in an array
	table = N.array(rows)

	# converting the data to an appropriate data type, one column at a time
	columns = [convert_search(column, missing) for column in table.T]

	# collecting datatypes and variable names for the different columns
	descr = [(name, column.dtype) for name, column in zip(varnm[:cols], columns)]

	# converting to a recarray
	return N.rec.fromarrays(columns, dtype=descr)

def load_spec(reader, varnm, types):
	"""
	Read every row from ``reader`` into a structured array, converting each
	column with the type the caller specified.

	Parameters
	----------
	reader : iterable of rows, each row a list of strings (e.g. a csv.reader)
	varnm : list of str
		Field names, one per column.  They are only used as field names, so
		they do not have to be valid Python identifiers.
	types : list of str
		One type per column:
		  'int', 'float', 'bool', 'complex'  -- converted with that builtin
		  'str' / 'strN' (or 'S' / 'SN')     -- kept as string, dtype 'SN'
		  'date'                             -- pylab.datestr2num, stored as float
		Any other name is handed to ``N.dtype``; its scalar type is used as
		the converter (this relies on that scalar type accepting a string).

	Returns
	-------
	numpy structured array built with ``N.fromiter``.

	Raises
	------
	SystemExit
		If the types do not fit the data (bad type name, wrong number of
		types or cells, value that cannot be converted).  The message names
		the underlying error and the exit status is non-zero.
	"""

	# imported here because the module header does not import sys
	import sys

	builtin_types = {'int': int, 'float': float, 'bool': bool, 'complex': complex}

	try:
		if len(varnm) != len(types):
			raise ValueError("%d variable names but %d types" % (len(varnm), len(types)))

		# one converter (string cell -> value) and one dtype entry per column
		converters = []
		descr = []
		for name, spec in zip(varnm, types):
			as_string = re.match(r'(?:str|S)(\d*)$', spec)
			if as_string:
				# 'str10' --> converter str, dtype 'S10'
				converters.append(str)
				descr.append((name, 'S' + as_string.group(1)))
			elif spec in ('date', 'pylab.datestr2num'):
				# dates are parsed to a date-number, which is stored as float
				converters.append(pylab.datestr2num)
				descr.append((name, 'float'))
			elif spec in builtin_types:
				converters.append(builtin_types[spec])
				descr.append((name, spec))
			else:
				converters.append(N.dtype(spec).type)
				descr.append((name, spec))

		width = len(converters)

		def typed_rows():
			# yields one tuple of converted values per input row
			for row in reader:
				if len(row) != width:
					raise ValueError("row has %d cells, expected %d" % (len(row), width))
				yield tuple([convert(cell) for convert, cell in zip(converters, row)])

		return N.fromiter(typed_rows(), dtype = descr)
	except Exception as err:
		sys.exit("The data-types provided are not working correctly. Try changing 'int' to 'float'. Also check if there are missing values. (%s)" % err)

def load(fname,delim = ',',has_varnm = True, varnm = None, types = None,  missing = False, prn_report = True):
	"""
	Load a delimited text file into a numpy record/structured array using
	the csv module.

	Parameters
	----------
	fname : str
		Path of the file to read.
	delim : str
		Field delimiter.
	has_varnm : bool
		Only looked at when ``varnm`` is empty: true means the first line of
		the file holds the variable names, false means names var1, var2, ...
		are generated.
	varnm : list of str, optional
		Variable names to use.  When given, no line of the file is treated
		as a header.
	types : list of str, optional
		Column types for ``load_spec``.  When empty the types are detected
		by ``load_search``.
	missing : bool
		Passed to ``load_search``: treat '' and '.' as missing values.
	prn_report : bool
		Print a short report about the loaded data.

	Returns
	-------
	The array built by ``load_spec`` or ``load_search``, one field per column.

	Raises
	------
	ValueError
		If the variable names have to be read or counted from the file and
		the file is empty.
	"""

	import sys

	# the csv module wants a binary file on Python 2 and a text file opened
	# with newline='' on Python 3
	if sys.version_info[0] < 3:
		handle = open(fname, 'rb')
	else:
		handle = open(fname, 'r', newline = '')

	# the with-block closes the file even when loading fails
	with handle as f:
		reader = csv.reader(f, delimiter = delim)

		if varnm:
			names = list(varnm)
		else:
			# the first row is needed: either it holds the names or it tells
			# how many names have to be generated
			try:
				first = next(reader)
			except StopIteration:
				raise ValueError("%s is empty" % fname)

			if has_varnm:
				names = first
			else:
				names = ['var%s' % (i + 1) for i in range(len(first))]
				# the first row is data, so start reading from the top again
				f.seek(0)
				reader = csv.reader(f, delimiter = delim)

		# making sure the variable names contain no leading, trailing or
		# embedded whitespace
		names = [re.sub(r'\s', '_', name.strip()) for name in names]
		cols = len(names)

		# use different conversion method if types are provided
		if types:
			data = load_spec(reader, names, types)
		else:
			data = load_search(reader, missing, names, cols)

	# load report
	if prn_report:
		print("##########################################\n")
		print("Loaded file: %s\n" % fname)
		print("Nr obs: %s\n" % data.shape[0])
		print("Variables and datatypes:\n")
		for i in data.dtype.descr:
			print("Varname: %s, Type: %s, Sample: %s" % (i[0], i[1], str(data[i[0]][0:3])))
		print("\n##########################################\n")

	return data

if __name__ == '__main__':

	import csv, sys

	def _write_csv(path, rows):
		"""Write ``rows`` (a list of lists of strings) to ``path`` as csv."""
		# the csv module wants a binary file on Python 2 and a text file
		# opened with newline='' on Python 3
		if sys.version_info[0] < 3:
			handle = open(path, 'wb')
		else:
			handle = open(path, 'w', newline = '')
		with handle as f:
			csv.writer(f).writerows(rows)

	# creating data: one row of variable names followed by five observations
	data = [['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7'],
			['1','3','1/97','1.12','2.11','001','bla1'],
			['1','2','3/97','1.21','3.12','002','bla2'],
			['2','1','2/97','1.12','2.11','003','bla3'],
			['2','2','4/97','1.33','2.26','004','bla4'],
			['2','2','5/97','1.73','2.42','005','bla15']]

	# saving the data, including the variable names, to a csv file
	_write_csv('testdata_with_varnm.csv', data)

	# saving the data without the variable names to a csv file
	_write_csv('testdata_without_varnm.csv', data[1:])

	# opening data file with variable names
	ra = load('testdata_with_varnm.csv')

	# opening data file without variable names
	ra = load('testdata_without_varnm.csv',has_varnm = False)

	# opening data file without variable names, giving your own variable names
	ra = load('testdata_without_varnm.csv',varnm = ['v1','v2','v3','v4','v5','v6','v7'])

	# opening data file with variable names, giving your own types
	ra = load('testdata_with_varnm.csv',types = ['int','int','date','float','float','str10','str10'])

	# opening data file without variable names, giving your own variable names and types
	ra = load('testdata_without_varnm.csv',varnm = ['v1','v2','v3','v4','v5','v6','v7'], types = ['int','int','date','float','float','str10','str10'])

	# opening data file without variable names, giving your own types
	ra = load('testdata_without_varnm.csv',has_varnm = False, types = ['int','int','date','float','float','str10','str10'])

	# note: put 'from load import *' in the myutils/__init__.py file. Maybe then you can do 'import myutils as U; ra = U.load(...)'
