#!/usr/bin/env python
# -*- coding: utf-8 -*-

''' Seek in a directory for the same picture with different names.
'''

import sys
import os
from PIL import Image
import ImageFile
import numpy
import math
from collections import Iterable


# Size passed to Image.resize() before hashing; the average hash therefore
# has one character per pixel of a thumbnail of this size.
SIZE = (8, 8)
# Default directory walked by PictureSeeker.get_identical_picture().
PATH = '/media/f/photos/campus'

class PictureIdentifier(object):
    ''' Identify whether two pictures are the same via the average hash
    algorithm.

    A picture is reduced to a string of '0'/'1' characters, one per pixel of
    a SIZE thumbnail. Two pictures are considered the same when their hashes
    differ in only a few positions (small Hamming distance).
    '''

    def average_hash(self, name):
        ''' Compute the average hash of the image file `name`.

        The image is resized to SIZE and converted to greyscale (mode 'L');
        every pixel then becomes '1' when it is brighter than the mean of
        all pixels and '0' otherwise.

        Returns the hash as a str of '0'/'1' characters, or -1 when the file
        cannot be read as an image (callers test for this sentinel).
        '''
        try:
            im = Image.open(name)
            print('Hashing %s' % (name,))
            # Pillow documents Image.open() as lazy: pixel data is only
            # decoded when the image is processed, so a damaged file can
            # still fail here. Keep the processing inside the try block so
            # such files are skipped as well.
            grey = im.resize(SIZE).convert('L')
            data = list(grey.getdata())     # one brightness value per pixel
        except IOError as e:  # Ignore files not recognized
            print('IOError %s' % (e,))
            return -1

        # Binarize every pixel against the mean brightness.
        mean_value = numpy.mean(data)
        return ''.join(['1' if value > mean_value else '0' for value in data])

    def hamming_distance(self, n1, n2):
        ''' Return the Hamming distance of two equal-length sequences.

        n1 and n2 must be both str (e.g. hashes from average_hash) or both
        list. The distance is the number of positions at which they differ.

        Raises TypeError when the arguments are not both str or both list,
        and ValueError when their lengths differ.
        '''
        both_str = isinstance(n1, str) and isinstance(n2, str)
        both_list = isinstance(n1, list) and isinstance(n2, list)
        if not (both_str or both_list):
            raise TypeError('n1 and n2 should be both str or both list.')
        # The distance is only defined for equal lengths; refuse instead of
        # silently ignoring the tail of the longer argument.
        if len(n1) != len(n2):
            raise ValueError('n1 and n2 should have the same length '
                             '(%d != %d).' % (len(n1), len(n2)))

        return sum(1 for a, b in zip(n1, n2) if a != b)

    def is_same_picture(self, f1, f2, threshold=3):
        ''' Compare whether two pictures are the same.

        f1 and f2 are either both lists (already computed hash values, which
        are compared directly) or both file names (which are hashed with
        average_hash first). Note that a str is always treated as a file
        name, never as a hash.

        The pictures count as the same when the Hamming distance of their
        hashes is below `threshold`. The verdict is printed and returned as
        a bool.

        Raises TypeError when the arguments are of unsupported or mixed
        kinds, or when one of the files cannot be hashed.
        '''
        f1_is_hash = isinstance(f1, list)
        f2_is_hash = isinstance(f2, list)

        # Lists are Iterable too, so they must be recognised first;
        # otherwise hash lists would be handed to average_hash as file names.
        if f1_is_hash and f2_is_hash:
            d = self.hamming_distance(f1, f2)
        elif (not f1_is_hash and not f2_is_hash and
              isinstance(f1, Iterable) and isinstance(f2, Iterable)):
            ah1 = self.average_hash(f1)
            ah2 = self.average_hash(f2)
            if ah1 == -1 or ah2 == -1:
                # TypeError is kept because that is what the failed
                # comparison raised before; only the message is clearer.
                raise TypeError('(%s, %s) cannot both be read as pictures'
                                % (f1, f2))
            d = self.hamming_distance(ah1, ah2)
        else:
            raise TypeError('(%s, %s) should be both file names or both '
                            'hash lists' % (f1, f2))

        same = d < threshold
        if same:
            print('==>Same: %s --- %s' % (f1, f2))
        else:
            print('==>Different: %s --- %s' % (f1, f2))
        return same


class PictureSeeker(PictureIdentifier):
    ''' Seek in a directory for the same picture with different names.
    '''

    def get_identical_picture(self, path=PATH):
        ''' Find identical pictures under `path` (searched recursively).

        Every file found by os.walk is hashed with average_hash; files that
        cannot be hashed (average_hash returns -1) are skipped. Each pair of
        files whose hashes have a Hamming distance below 2 is printed as
        "<file1> <file2> --- <hash of file2>".

        Returns the list of (file1, file2) path tuples that were reported.
        '''
        # Key the hashes by full path: the same file name may occur in
        # several sub-directories, and those files must not overwrite each
        # other before they are compared.
        hash_dict = {}
        for root, _dirs, files in os.walk(path):
            for f in files:
                full_name = os.path.join(root, f)
                hash_value = self.average_hash(full_name)
                if hash_value != -1:
                    hash_dict[full_name] = hash_value

        # Compare every unordered pair once; sorting makes the report
        # order deterministic.
        names = sorted(hash_dict)
        duplicates = []
        for i, first in enumerate(names):
            for second in names[i + 1:]:
                distance = self.hamming_distance(hash_dict[first],
                                                 hash_dict[second])
                if distance < 2:
                    print('%s %s --- %s' % (first, second, hash_dict[second]))
                    duplicates.append((first, second))

        return duplicates

def main():
    ''' Report the identical pictures found under the default PATH. '''
    PictureSeeker().get_identical_picture()

if __name__ == '__main__':
    main()

