/* File: codhuff.c
   Author: NJA
   Based on original code by David Bourgin (David.Bourgin@ufrima.imag.fr)
   
   Creation date: 7/2/94
   Last update: 8/6/00
   
   Purpose: Create a canonical Huffman compressed character table 
            and generate all the associated tables for instant access to a
            character given by its ucs2 code.
            
   !!! Many bugs remaining !!!
   
   Sources & Documents:
   Efficient Decoding of Prefix Codes, pp.10-12
   D. S. Hirschberg and D. A. Lelewer
   (For correct description of the method)
   
   DEFLATE Compressed Data Format Specification v1.3
   P. Deutsch
   (For building the Canonical Huffman codes)
   
   For simple (and somewhat inaccurate) intros
   http://www.arturocampos.com/ac_static_huffman.html
   http://www.compressconsult.com/huffman/
*/

#include <stdio.h>
/* For routines fgetc,fputc,fwrite and rewind */
#include <memory.h>
/* For routines memset,memcpy */
#include <malloc.h>
/* For routines malloc and free */
#include <stdlib.h>
/* For routines qsort et exit */
#include <math.h>
/* For routine log2. */
#include  <unistd.h>
/* For unix system calls (exec) */

//#define _DEBUG_
#define _ENCODER_
#include "huffman.h"
#include "lookup.h"


#define MAX_LENGTH 32


/* Macro implementing assertions */ 
#define Assert(cond,msg) {if(!(cond)) {puts(msg);exit(1);} }
/* Macro to print val in binary form */
#define fpBinary(file, val) for (i = 0; i < 8; i++){ \
  fprintf((file), "%d, ", ((val) & (128 >> i)) >> (7-i) );\
}


/* Number of bits written to output file */
unsigned char num_bits_written = 0;
unsigned int val_to_write = 0;

lookup_table lookup;

unsigned int weight[257];
unsigned long total;
static unsigned long numcol = 0;

unsigned char num_char_present[100];

/* Returned parameters: None
   Action: Write in the output stream the binary-coded value of 'bin_val'
   Errors: An input/output error could disturb the running of the program
*/
void write_bin_val(t_bin_val bin_val)
{ 
  unsigned char bin_pos,
                byte_pos;
                
#ifdef _DEBUG_
  unsigned int i, tmp;
#endif
  
  /* Compute the number of bits to write */
  byte_pos = (LENGTH(bin_val) + 7) >> 3;       /* return ceil( bytes ) */
  bin_pos = LENGTH(bin_val) & 7;               /* Nb of bits */
  
  if (bin_pos)
  {
    byte_pos--;
    /* Write the last few bits that do not group into a full byte */
    val_to_write = (val_to_write << bin_pos)|BITS(bin_val)[byte_pos];
    num_bits_written += bin_pos;
    if (num_bits_written >= 8)
    {
      num_bits_written -= 8;
#ifndef _DEBUG_
      fprintf(dest_file,"%d, ", val_to_write >> num_bits_written);
#else
      fpBinary(dest_file, val_to_write >> num_bits_written);
#endif
      val_to_write &= ((1 << num_bits_written) - 1);
      if (++numcol % 8) fprintf(dest_file, "\n");
    }
  }
  
  while (byte_pos)
  { 
    /* Write a full byte */
	  byte_pos--;
    val_to_write = (val_to_write << 8)|BITS(bin_val)[byte_pos];
#ifndef _DEBUG_
    fprintf(dest_file,"%d, ", val_to_write >> num_bits_written);
#else
    fpBinary(dest_file, val_to_write >> num_bits_written);
#endif
    val_to_write &= ((1 << num_bits_written) - 1);
    if (++numcol % 8) fprintf(dest_file, "\n");
  }
}


/* Returned parameters: None
   Action: Fills the last byte to write in the output stream with zero values
   Errors: An input/output error could disturb the running of the program
*/
void pad_byte(void)
{ 
  if (num_bits_written)
  {
#ifndef _DEBUG_
    fprintf( dest_file, "%d, ", val_to_write << (8 - num_bits_written) );
#else
    int i;
    fpBinary(dest_file, val_to_write << (8 - num_bits_written));
#endif
  if (++numcol % 8) fprintf(dest_file, "\n");
  }
}


/* 
  Insertion sort (simple and fast for arrays < 1000)
*/
void insertSort(p_tree *a,  long lb,  long ub) 
{
    p_tree t;
    long i, j;
#define GT(a, b) ((a)->weight > (b)->weight)
   /**************************
    *  sort array a[lb..ub]  *
    **************************/
    for (i = lb + 1; i <= ub; i++) {
        t = a[i];

        /* Shift elements down until */
        /* insertion point found.    */
        for (j = i-1; j >= lb && GT(a[j], t); j--)
            a[j+1] = a[j];

        /* insert */
        a[j+1] = t;
    }
#undef GT
}

/* Returned parameters: None
   Action: Suppresses the allocated memory for the 'tree'
   Errors: None if the tree has been build with dynamical allocations!
*/
void  del_tree(p_tree tree)
{ 
  if (tree!=NULL)
   { 
  	  del_tree(LEFT_BRANCH(tree));
  	  del_tree(RIGHT_BRANCH(tree));
  	 free(tree);
   }
}

/* Returned parameters: Returns a comparison status
   Action: Returns a negative, zero or positive integer depending on the weight
   of 'tree2' is less than, equal to, or greater than the weight of 'tree1'
   Errors: None
*/
int weight_tree_comp(const p_tree tree1, const p_tree tree2)
{ 
  return (WEIGHT_OF_NODE(tree2) - WEIGHT_OF_NODE(tree1));
}

/* Returned parameters: Returns a tree of encoding
   Action: Generates an Huffman encoding tree based on the data from the stream
           to compress.
           
   Scheme : a Huffman tree
   Errors: If no memory is available for the allocations then a 'BAD_MEM_ALLOC'
           exception is raised
*/

p_tree  Build_Huffman_Tree()
{ 
  register unsigned int charac;
  p_tree  Huff_tree[257],
         ptr_fictive_node;

  int weight_tree_comp(const p_tree, const p_tree);
  void bsort(p_tree*, int);
  
  /* Reset the all the leaves to 0 */
  for ( charac = 0; charac <= 256; charac++)
  {
    
    Huff_tree[charac] = (p_tree) malloc(sizeof(t_tree));
    if ( Huff_tree[charac] == NULL )
    {
      for (; charac; charac--)
	    {
	  	  free( Huff_tree[charac-1]);
      }
	    exit(BAD_MEM_ALLOC);
    }
    
    NODE_BYTE(Huff_tree[charac])     = charac;
    WEIGHT_OF_NODE(Huff_tree[charac])= 0;
    LEFT_BRANCH(Huff_tree[charac])   = NULL;
    RIGHT_BRANCH(Huff_tree[charac])  = NULL;
  }
  
  memset (weight, 0, 257*sizeof(unsigned int));  
  /* Compute the weight (that is: the occurences) of each leaf 
     in the data to compress */
  if (!end_of_data())
  {
    /* Count the occurences of each character -> weight */
    while (!end_of_data())
    {
      charac = read_byte();
      WEIGHT_OF_NODE( Huff_tree[charac]) += 1;
    }
    WEIGHT_OF_NODE( Huff_tree[256]) = 1;        /* End of file symbol */

    total = 0;
    /*save weights to compute statistics later */
    for (charac = 0; charac < 256; charac++)
    {
      weight[charac] = WEIGHT_OF_NODE(Huff_tree[charac]);
      total += weight[charac];
    }
    
    /* Sort the leaves depending on the weight of each character */
    //qsort(Huff_tree, 257, sizeof(p_tree), weight_tree_comp);
    insertSort(Huff_tree, 0, 256);
    
    /* Discard the characters that are absent of the data */
    for (charac = 0; charac < 256; charac++)
    {
      if (WEIGHT_OF_NODE(Huff_tree[charac]) == 0) free( Huff_tree[charac]);
    }
    
    charac++; /* Necessary because of the first charac-- that follows */  
    
    /* This is where the Huffman tree is really built, level by level */
    /* From there, 'charac' gives the number of different bytes with a null
       occurrence in the stream to compress */       
   
    while ( charac ) 	 /* Looks up (charac+1)/2 times the occurrence table to 
                        link the nodes in an unique tree */
    {
      ptr_fictive_node = (p_tree) malloc(sizeof(t_tree));
      if ( ptr_fictive_node == NULL )
      {
        for (charac = 0; charac <= 256; charac++)
        {
          del_tree( Huff_tree[charac]);
        }
        exit(BAD_MEM_ALLOC);
      }
      
      /* Build a node with the weight of its two children */
      NODE_BYTE(ptr_fictive_node)     = NOT_A_LEAF;
      
      charac--;
      WEIGHT_OF_NODE(ptr_fictive_node) = WEIGHT_OF_NODE( Huff_tree[charac]);
      LEFT_BRANCH(ptr_fictive_node)    =  Huff_tree[charac];
    	
    	if (charac)
    	{
    	  charac--;
    	  WEIGHT_OF_NODE(ptr_fictive_node) += WEIGHT_OF_NODE( Huff_tree[charac]);
    	  RIGHT_BRANCH(ptr_fictive_node) =  Huff_tree[charac];
      }
      else
      {
        RIGHT_BRANCH(ptr_fictive_node) =  NULL;
      }
      Huff_tree[charac] = ptr_fictive_node;
      /* Sort the nodes of this level with respect to their weight */
      //qsort(Huff_tree, charac+1, sizeof(p_tree), weight_tree_comp);
      insertSort(Huff_tree, 0, charac);
      
      if (charac)		/* Is there another node at this level of the tree? */
    	 charac++; 		/* If yes, handle the fictive node */
    }
  }
  return (*Huff_tree);
}




/* Returned parameters: The array 'Hcodes' may be modified
   Action: Compute the length of each character in the table. 
   'val_code' gives the encoding for the current node of the tree
   Errors: None
*/
void compute_lengths(p_tree tree, t_bin_val Hcodes[257], t_bin_val* code_val)
{ 
  register unsigned int i;
  t_bin_val tmp_code_val;

  if (NODE_BYTE(tree) == NOT_A_LEAF)
  {
    if (LEFT_BRANCH(tree)!= NULL)       /* The left sub-tree starts with a 1 */
    {
      tmp_code_val = *code_val;
      LENGTH(*code_val)++;
      compute_lengths(LEFT_BRANCH(tree), Hcodes, code_val);
      *code_val = tmp_code_val;
    }
    if (RIGHT_BRANCH(tree) != NULL)    /* The right sub-tree starts with a 1 */
    {
      tmp_code_val = *code_val;
      LENGTH(*code_val)++;
      compute_lengths(RIGHT_BRANCH(tree), Hcodes, code_val);
      *code_val = tmp_code_val;
    }
  }
  else    /* The node is a leaf */
  {
	  Hcodes[NODE_BYTE(tree)] = *code_val;
  }
}

/* Returned parameters: The data in 'Hcodes' will be modified
   Action: Stores the encoding tree as a binary encoding table to 
   speed up the access by calling compute_lengths
   Errors: None
*/
void create_codes_table(p_tree tree, 
                        t_bin_val Hcodes[257],
                        t_bin_val Cancode[257] )
{			
  unsigned long average;
  int len, min_length, max_length;
  unsigned long base_code[MAX_LENGTH], code, mask, tmp;
  unsigned int bl_count[MAX_LENGTH];
  unsigned int charac, max_char, count, indx;
  unsigned int ii, jj;
  unsigned char byte[4];
  t_bin_val code_val;

  char *unixsort = "sort -bdni Hcodes_t>Hcodes_t.h";
  
  memset( (unsigned char *)&code_val, 0, sizeof(code_val) );
  for (charac = 0; charac < 257; charac++)
  {
    memset((unsigned char *)BITS(Hcodes[charac]), 0, NUM_BITS);
    LENGTH(Hcodes[charac]) = 0;
    
    memset((unsigned char *)BITS(Cancode[charac]), 0, NUM_BITS);
    LENGTH(Cancode[charac]) = 0;
  }          
  for (len = 0; len <= MAX_LENGTH; len++)
  {
    base_code[len] = (unsigned long) 0;
    bl_count[len] = 0;
  }

  compute_lengths(tree, Cancode, &code_val);
  
  /* Some test values (see Hirschberg & Lelewer) */
  /*
  LENGTH(Cancode[97])  = 2;
  LENGTH(Cancode[98])  = 2;
  LENGTH(Cancode[99])  = 3;
  LENGTH(Cancode[100]) = 3;
  LENGTH(Cancode[101]) = 3;
  LENGTH(Cancode[102]) = 4;
  LENGTH(Cancode[103]) = 4;
  */
  
  /* Print statistics */
  average = 0;
  len = max_length = 0;
#ifdef _DEBUG_
  puts("  i\tOccur\tLen\ti\tOccur\tLen\n");
#endif
  for ( charac = 0; charac  <= 256; charac++)
  { 
    len = LENGTH(Cancode[charac]);
    max_length = ( len && len >= max_length ) ? len : max_length;
    min_length = ( len && len <= min_length ) ? len : min_length;
    average += len * weight[charac];
    /* entropy += 1/weight[charac] * log2 (1/weight[charac]); */
#ifdef _DEBUG_    
    printf ("%3d  %5d\t%2d\t", charac, weight[charac], len );
	  if (charac % 2) puts("\n");
#endif
  }
  printf( "\n%li chars in source file\n", total);
  printf( "Min/Max code length: %d, %d\n", min_length, max_length);
  printf( "Average length: %f  bits/byte\n", (float) average/total);
  /* printf( "Entropy (info. content) : %f bits\n",  entropy ); */
  
  /* ---- BuildCanonical Huffman codes ---- */

  /* Step 1 : Count the number of codes for each code length. */
  max_char = 0;
  for (len = min_length; len <= max_length; len++)
  {
    for ( charac = 0; charac < 256; charac++)
    {
      if ( LENGTH(Cancode[charac]) == len )
      {
        bl_count[len]++;
        max_char =  charac > max_char ? charac : max_char; 
      }
    }
  }

  /* Step 2 : Find the binary value of the smallest code of each length */
  code = (unsigned long) 0;
  for (len = min_length; len <= max_length; len++ )
  {
    code = (unsigned long) ( code + bl_count[len - 1] ) << 1;
    base_code[len] = code;
  }

  Assert (code + bl_count[max_length]-1 == (1<<max_length)-1,
            "inconsistent bit counts. Exiting");

  /* Step 3 : Assign binary values to all codes, using consecutive values for
     all codes of the same length, with the base value determined at step 2.
     Output the values to 'Hcodes' file. This file must be sorted using Unix
     sort in order of increasing length and value.
  */
  fprintf(code_file, "const unsigned long Hcode[] =\n{\n");
  for (charac = 0; charac <= max_char; charac++)
  {
    len = LENGTH(Cancode[charac]);
    if (len != 0)
    {
      /* Copy base_code[len] in Cancode[charac].bits 
         with 'justification on the left'. A simple
         memcpy does not suffice.
      */
      memset(byte, 0, sizeof(unsigned long));
      tmp = base_code[len];

      byte[3] = (unsigned char)(tmp & 0xFF) ;
      tmp >>= 8;
      byte[2] = (unsigned char)(tmp & 0xFF);
      tmp >>= 8;
      byte[1] = (unsigned char)(tmp & 0xFF);
      tmp >>= 8;
      byte[0] = (unsigned char)(tmp & 0xFF);
      for (ii = 0; ii <= 3; ii++)
      {
        if (byte[ii] == 0) continue;
        for (jj = ii; jj <= 3; jj++)
        {
          Cancode[charac].bits[jj-ii] = byte[jj];
        }
      }
      
      /* Write the current code and do the next one */
      fprintf(code_file, "\n\t%ld,\t/* %d */", base_code[len], len);
      base_code[len]++;
    }
  }
   
  fclose(code_file);
  system(unixsort);  /* Sort the values in the file */
  code_file = fopen("Hcodes_t.h","a");
  fprintf(code_file, "\n};\n");

  system("rm -f Hcodes_t");


  /* Create array of index of beginning of codes of each length */
  fprintf(code_file, "const unsigned char Hcode_indx[] = \n{");
  indx = 0;
  fprintf(code_file, "\n\t0,");
  for (len = min_length; len < max_length; len++)
  {
    indx += bl_count[len];
    fprintf(code_file, "\n\t%vi d,", indx );
  }
  fprintf(code_file, "\n};");
  fclose(code_file);  
}
  
/*
  Generate rudimentary tables implementing the input data.
  These tables must be modified by hand and included
  in a .c or .h file for recompilation for ARM.
  
  Each table consists of a ucs2 row of input data, and
  32 bytes indicating the presence/absence of each
  character in the row, and 32 bytes indicating the number 
  of bits set to 1 for each presence byte.
*/
void generate_row_table()
{
  unsigned long code, possible_val;
  unsigned int row, col, prevrow;
  unsigned char byte;
  unsigned char nrows, numbits_set;
  unsigned char bit, jmp; 

#ifdef _DEBUG_
  unsigned int i;
#endif
  nrows = 0;
  prevrow = 0;
  numbits_set = 0;
  
  fscanf(ucs2_file, "%2x%2x", &row, &col);
  code = (row << 8) | col;
  possible_val = row << 8; 
       
  fprintf(rows_file, 
          "const unsigned char row[%d] = /* 81 * 32 */\n{\n", 81*32);
  fprintf(numbits_file,
          "const unsigned char bitsnum[%d] = /* 81 * 32 */\n{\n", 81*32);

  while ( !feof(ucs2_file) )
  {
    jmp = row - prevrow;
    if ( jmp )           /* If the row number has changed, */
    {                        /* create a new table.        */ 
    
      if ( bit >= 248 )      /* Don't forget the last byte */
      {
#ifndef _DEBUG_
        fprintf(rows_file, "%3d,", byte);
				fprintf(numbits_file, "%3d,", numbits_set);
#else
        fpBinary(rows_file, byte);
        fprintf(numbits_file, ", %3d,", numbits_set);
#endif
      }
      num_char_present[nrows] = numbits_set;
      possible_val = row << 8;
      bit = -1;
      byte = 0;
      numbits_set = 0;
      
      fprintf(rows_file, "\n\n/* %lx */", code);
      fprintf(numbits_file, "\n\n/* %lx */", code);

			nrows++;
    }
    
    do
    /* Print the presence table */
    {
      /* Write the presence bits: if the ucs2 value
         possible_val is absent, skip it and increment bit number. 
         Do this until it is equal to the code read in the file.
      */
      if ( !(++bit % 8) && bit )
      {
#ifndef _DEBUG_
        fprintf(rows_file, "%3d, ", byte);
				fprintf(numbits_file, "%3d, ", numbits_set);
#else
        fpBinary(rows_file, byte);
        fprintf(numbits_file, ", %3d, ", numbits_set);
#endif
        byte = 0;
      }
      
#ifndef _DEBUG_
      /* Every 16 presence bytes, carriage return */
      if ( !(bit % 128) )
      {
        fprintf(rows_file, "\n\t");
				fprintf(numbits_file, "\n\t");
      }
#else
      if ( !(bit % 8) )
      {
        fprintf(rows_file, "\n\t");
				fprintf(numbits_file, "\n\t");
      }
#endif

    } while ( ++possible_val <= code && bit < 256 );
    
    byte |= (128 >> (bit % 8));   /* Put the 1s and 0s...    */

    numbits_set++;              /* Count the bits set to 1 */
    prevrow = row;
    
    fscanf(ucs2_file, "%2x%2x", &row, &col);   /* Read next UCS2 code */
    code = (row << 8) | col;
  }
  
  fprintf(rows_file, "\n};\n\n");
	fprintf(numbits_file, "\n};");
	//system("cat numbits >> rows_table.h");
	//system("rm -f numbits");
  printf("%d rows in ucs2 file.\n", nrows);
}



/* Returned parameters: None
   Action: Compresses with Huffman method all bytes read by 
           the function 'read_byte'
   Errors: An input/output error could disturb the running 
           of the program
*/
void huffmanencoding()
{ 
  p_tree tree;

  unsigned int i, end, num_garbage = 0, sum_garbage = 0;
  unsigned int num_entries = 0;
  unsigned int offset = 0;
  unsigned long row_first = 0L;
  unsigned long row_firstaddress[80];
  unsigned int row_offset[80];

  
  unsigned int chinese_char_length; 
  t_bin_val encoding[257];
  t_bin_val canoncodes[257];
  t_bin_val garbage;  
  unsigned char byte_read;
  
  unsigned int row, prevrow, col, nrows, nline;
  unsigned int code;
  
  nrows = 0;
  nline = 0;
  for (i = 0; i < 80; i++)
  {
     row_firstaddress[i] = 0L;
     row_offset[i] = 0;
  }
  
  
  if (!end_of_data())
  {
    /* ---- 1st pass ---- */
    /* ---- Creation of the Huffman codes ---- */
    puts("Creating the Huffman codes...\n");
    
    /* Create a Huffman tree to compute code lengths */
    tree =  Build_Huffman_Tree(); 
    
    /* Create a encoding lookup table to speed up the accesses */				  
    create_codes_table(tree,encoding, canoncodes);
    
    del_tree(tree);

    
    /* ---- 2nd pass ---- */
    /* ---- Do the compression (encoding) of the data ---- */
    puts("Encoding the input file...\n");
   
    beginning_of_data();
    rewind(ucs2_file);
    
    fscanf(ucs2_file, "%2x%2x", &row, &col);   /* Read next UCS2 code */
    code  = (row << 8) | col;
    
    fprintf(table_file, "const unsigned int row_address[] = \n{\n");

    row_offset[0] = 0;
    while ( !(end = end_of_data()) )
    {
  	  if ( !end )
  	  {
  	    byte_read = read_byte();
  	    /* Encode and write encoded value */
  	    write_bin_val(canoncodes[byte_read]);
      }
      else
      {
        break;
        puts("Fatal Error : end of data is corrupted ! Exiting.\n");
        exit(1);
      }
      
      
      /* Add a new entry to the compressed characters adress table */      
    }
    
    /* Write the code signaling the end of encoding */
    write_bin_val(canoncodes[256]);
    /* Fill the last byte before closing file, if necessary */
    pad_byte();
    printf("Paddings: %d, totalizing %d bytes\n", 
             num_garbage + 1, sum_garbage/8);
    printf("Chinese characters table: %d entries, %d bytes\n", num_entries,
      num_entries*4);			 
  }
}


/* Returned parameters: None
   Action: Displays the help of the program and then stops its running
   Errors: None
*/
void help()
{ 
  puts("This utility enables you to compress a file"); 
  puts("using a straightforward Huffman algorithm.\n");
  puts("Usage: codhuff source target\n");
  puts("source: Name of the file to decompress\n");
  puts("target: Name of the restored file\n");
}


/* Returned parameters: Returns an error code (0=None)
   Action: Main procedure
   Errors: Detected, handled and an error code is returned, if any
*/
int main(int argc,char* argv[])

{   
  /*if (argc != 3)
  { 
    help();
    exit(BAD_ARGUMENT);
  }
  else
  */{ 
    source_file = fopen("us2lite.16","rb");
    dest_file = fopen("us2.huf","w+b");
    ucs2_file = fopen("ishs_62.txt", "r");
    table_file = fopen("chinese_table", "w+");
    rows_file = fopen("rows_table.h", "w+");
		numbits_file = fopen("numbits", "w+");
    code_file = fopen("Hcodes_t", "w+");
    
    if ( (source_file == NULL)||
         (dest_file == NULL) )
    {
      help();
      exit(BAD_FILE_NAME);
    }
    else 
      if ( (ucs2_file == NULL)  ||
           (table_file == NULL) )
    {
      puts("Error opening file 'ishs_62.txt' or 'chinese_table'");
    }
    else 
    {
      generate_row_table();
      fclose(rows_file);
      
      huffmanencoding();
      fclose(source_file);
      fclose(dest_file);
      fclose(table_file);
    }
    printf("Execution of codhuff completed.\n");
    return (NO_ERROR);
  }
}

