/* 22-APR-2002, David Mathog, mathog@caltech.edu
   modified version of gnu uniq.
   
   added switches and modes so that it can emit a range of duplicated
   entries (2->4th), or the first in a run, or the last in a run.
   Useful for handling something like:

   123
   123 extra
   
   and you want the second (or last) line only.
   
   To build this obtain the gnu testutils source. Unpack it.  Drop
   this file on top of "uniq.c" in that distribution, then
   ./configure
   ./make
   ./make check
   
   (It should pass all uniq tests, the extra switches aren't tested.)
   
   One known difference between this and regular "uniq" is the handling
   of uniq -c -w 1.  In this version the LAST in a run is emitted
   and in the original the first.  However they are identical within
   the specified field width.
**********************************************************************

   uniq -- remove duplicate lines from a sorted file
   Copyright (C) 86, 91, 1995-1998, 1999 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */

/* Written by Richard Stallman and David MacKenzie. */

#include <config.h>

#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>

#include "system.h"
#include "linebuffer.h"
#include "error.h"
#include "xstrtol.h"
#include "memcasecmp.h"

/* The official name of this program (e.g., no `g' prefix).  */
#define PROGRAM_NAME "uniq"

#define AUTHORS "Richard Stallman and David MacKenzie"

/* Undefine, to avoid warning about redefinition on some systems.  */
#undef min
#define min(x, y) ((x) < (y) ? (x) : (y))

/* The name this program was run with. */
char *program_name;

/* Number of fields to skip on each line when doing comparisons. */
static int skip_fields;

/* Number of chars to skip after skipping any fields. */
static int skip_chars;

/* Number of chars to compare; if 0, compare the whole lines. */
static int check_chars;

/* In a run of duplicats with -D set these are used to start
emitting at the nth and to emit count lines.   */
static int start_duplicate;
static int count_duplicate;

/* Print the last in a run of duplicates. */
static int last_duplicate;


enum countmode
{
  count_occurrences,		/* -c Print count before output lines. */
  count_none			/* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input. */
static enum countmode countmode;

enum output_mode
{
  output_repeated,		/* -d Only lines that are repeated. */
  output_all_repeated,		/* -D All lines that are repeated. */
  output_unique,		/* -u Only lines that are not repeated. */
  output_all			/* Default.  Print first copy of each line. */
};

/* Which lines to output. */
static enum output_mode mode;

/* If nonzero, ignore case when comparing.  */
static int ignore_case;

static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", no_argument, NULL, 'D'},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"last", no_argument, NULL, 'l'},
  {"start", required_argument, NULL, 'r'},
  {"number", required_argument, NULL, 'n'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};

void
usage (int status)
{
  if (status != 0)
    fprintf (stderr, _("Try `%s --help' for more information.\n"),
	     program_name);
  else
    {
      printf (_("\
Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
"),
	      program_name);
      printf (_("\
Discard all but one of successive identical lines from INPUT (or\n\
standard input), writing to OUTPUT (or standard output).\n\
\n\
  -c, --count          prefix lines by the number of occurrences\n\
  -d, --repeated       only print duplicate lines\n\
  -D, --all-repeated   print all duplicate lines\n\
  -f, --skip-fields=N  avoid comparing the first N fields\n\
  -r, --start=N        start printing at the Nth duplicated line\n\
  -n, --number=N       print up to N lines of each duplicate\n\
  -l, --last           print the last line in a run of duplicates\n\
  -i, --ignore-case    ignore differences in case when comparing\n\
  -s, --skip-chars=N   avoid comparing the first N characters\n\
  -u, --unique         only print unique lines\n\
  -w, --check-chars=N  compare no more than N characters in lines\n\
  -N                   same as -f N\n\
  +N                   same as -s N\n\
      --help           display this help and exit\n\
      --version        output version information and exit\n\
\n\
A field is a run of whitespace, then non-whitespace characters.\n\
Fields are skipped before chars.\n\
\nVersion 2.0 24-APR-2002 Caltech - modified from 2.0 base\n\
"));
      puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
    }
  exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}

/* Given a linebuffer LINE,
   return a pointer to the beginning of the line's field to be compared. */

static char *
find_field (const struct linebuffer *line)
{
  register int count;
  register char *lp = line->buffer;
  register size_t size = line->length;
  register size_t i = 0;

  for (count = 0; count < skip_fields && i < size; count++)
    {
      while (i < size && ISBLANK (lp[i]))
	i++;
      while (i < size && !ISBLANK (lp[i]))
	i++;
    }

  for (count = 0; count < skip_chars && i < size; count++)
    i++;

  return lp + i;
}

/* Return zero if two strings OLD and NEW match, nonzero if not.
   OLD and NEW point not to the beginnings of the lines
   but rather to the beginnings of the fields to compare.
   OLDLEN and NEWLEN are their lengths. */

static int
different (const char *old, const char *new, size_t oldlen, size_t newlen)
{
  register int order;

  if (check_chars)
    {
      if (oldlen > check_chars)
	oldlen = check_chars;
      if (newlen > check_chars)
	newlen = check_chars;
    }

  /* Use an if-statement here rather than a function variable to
     avoid portability hassles of getting a non-conflicting declaration
     of memcmp.  */
  if (ignore_case)
    order = memcasecmp (old, new, min (oldlen, newlen));
  else
    order = memcmp (old, new, min (oldlen, newlen));

  if (order == 0)
    return oldlen - newlen;
  return order;
}

/* Output the line in linebuffer LINE to stream STREAM
   provided that the switches say it should be output.
   If requested, print the number of times it occurred, as well;
   min(LINECOUNT,1) is the number of times that the line occurred.
   This odd form occurs because linecount is only nonzero if a 
   duplicate is found. */

static void
writeline (const struct linebuffer *line, FILE *stream, int linecount, int match)
{
/* 
(void)printf("linecount %d, match %d, mode %d\n",linecount,match,mode);
 */   
  if (  match ){
    switch (mode){
      case output_repeated:
        if(linecount == 1)return;
        break;
      case output_all_repeated:
        break;
      case output_unique:
        return;
        break;
      case output_all:
        if (countmode == count_occurrences)return;
        if(linecount > 1)return;
        break;
    }
  }
  else { /* could be end of run of duplicates, no match to next line though */
    switch (mode){
      case output_repeated: /* only first duplicated line is emitted */
        if(linecount == 1)return;
        break;
      case output_all_repeated:
        if(linecount < 2)return;
        break;
      case output_unique:  /* not unique, just the last in many duplicates */
        if(linecount > 1)return;
        break;
      case output_all:     /* used for count mode and default */
        if(countmode != count_occurrences && linecount > 1)return;
        break;
    }
  }

  if (countmode == count_occurrences)
    fprintf (stream, "%7d\t", linecount);

  if(mode != output_all_repeated){
     fwrite (line->buffer, sizeof (char), line->length, stream);
  }
  else {
    if( last_duplicate && (linecount==1 || match))return;
    if( linecount < start_duplicate )return;
    if( count_duplicate != 0 &&
        linecount >  start_duplicate + count_duplicate - 1)return;
    fwrite (line->buffer, sizeof (char), line->length, stream);
  }
}

/* Process input file INFILE with output to OUTFILE.
   If either is "-", use the standard I/O stream for it instead. */

static void
check_file (const char *infile, const char *outfile)
{
  FILE *istream;
  FILE *ostream;
  struct linebuffer lb1, lb2;
  struct linebuffer *thisline, *prevline, *exch;
  char *prevfield, *thisfield;
  size_t prevlen, thislen;
  int match_count = 0; /* duplicates when this is >= 2. */
  int match=0;

  if (STREQ (infile, "-"))
    istream = stdin;
  else
    istream = fopen (infile, "r");
  if (istream == NULL)
    error (EXIT_FAILURE, errno, "%s", infile);

  if (STREQ (outfile, "-"))
    ostream = stdout;
  else
    ostream = fopen (outfile, "w");
  if (ostream == NULL)
    error (EXIT_FAILURE, errno, "%s", outfile);

  thisline = &lb1;
  prevline = &lb2;

  initbuffer (thisline);
  initbuffer (prevline);

  if (readline (prevline, istream) == 0)
    goto closefiles;
  prevfield = find_field (prevline);
/*  (void)printf("found prevfield %s\n",prevfield); */
  prevlen = prevline->length - (prevfield - prevline->buffer);

  while (!feof (istream))
    {
      if (readline (thisline, istream) == 0)
	break;
      thisfield = find_field (thisline);
/*  (void)printf("found thisfield %s\n",thisfield); */
      thislen = thisline->length - (thisfield - thisline->buffer);
      match = !different (thisfield, prevfield, thislen, prevlen);
/*  (void)printf("match is %d\n",match); */

      match_count++;

      writeline (prevline, ostream, match_count, match);
      exch = prevline;
      prevline = thisline;
      thisline = exch;
      prevfield = thisfield;
      prevlen = thislen;

      if (!match){
         match_count = 0;  /* 1st instance of this type of line */
      }
        
      
    }
    
  match_count++; /* because it couldn't happen in the loop above */
  match=0;  /* because cannot match to a nonexistant last line */
/*  (void)printf("before final writeline %d\n",match); */
  writeline (prevline, ostream, match_count, match);

 closefiles:
  if (ferror (istream) || fclose (istream) == EOF)
    error (EXIT_FAILURE, errno, _("error reading %s"), infile);

  if (ferror (ostream) || fclose (ostream) == EOF)
    error (EXIT_FAILURE, errno, _("error writing %s"), outfile);

  free (lb1.buffer);
  free (lb2.buffer);
}

int
main (int argc, char **argv)
{
  int optc;
  char *infile = "-", *outfile = "-";

  program_name = argv[0];
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  skip_chars = 0;
  skip_fields = 0;
  check_chars = 0;
  start_duplicate = 1;
  count_duplicate = 0;
  last_duplicate = 0;
  mode = output_all;
  countmode = count_none;

  while ((optc = getopt_long (argc, argv, "0123456789cdlDf:is:uw:in:ir:", longopts,
			      NULL)) != -1)
    {
      switch (optc)
	{
	case 0:
	  break;

	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
	  skip_fields = skip_fields * 10 + optc - '0';
	  break;

	case 'c':
	  countmode = count_occurrences;
	  break;

	case 'd':
	  mode = output_repeated;
	  break;

	case 'D':
	  mode = output_all_repeated;
	  break;
          
	case 'f':		/* Like '-#'. */
	  {
	    long int tmp_long;
	    if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
		|| tmp_long <= 0 || tmp_long > INT_MAX)
	      error (EXIT_FAILURE, 0,
		     _("invalid number of fields to skip: `%s'"),
		     optarg);
	    skip_fields = (int) tmp_long;
	  }
	  break;

	case 'i':
	  ignore_case = 1;
	  break;

	case 'l':
          last_duplicate = 1;
	  mode = output_all_repeated; /* l only valid in this mode */
	  break;

	case 'n':
          {
	    long int tmp_long;
	    if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
	        || tmp_long < 0 || tmp_long > INT_MAX)
	      error (EXIT_FAILURE, 0,
	             _("invalid number of duplicates to print: `%s'"),
	             optarg);
	    count_duplicate = (int) tmp_long;
            mode = output_all_repeated; /* n only valid in this mode */
          }
	  break;
          
	case 'r':
          {
	    long int tmp_long;
	    if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
	        || tmp_long < 0 || tmp_long > INT_MAX)
	      error (EXIT_FAILURE, 0,
	             _("invalid start line to print: `%s'"),
	             optarg);
	    start_duplicate = (int) tmp_long;
            if(start_duplicate == 0)start_duplicate=1;
            mode = output_all_repeated; /* r only valid in this mode */
          }
	  break;
          
	case 's':		/* Like '+#'. */
	  {
	    long int tmp_long;
	    if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
		|| tmp_long <= 0 || tmp_long > INT_MAX)
	      error (EXIT_FAILURE, 0,
		     _("invalid number of bytes to skip: `%s'"),
		     optarg);
	    skip_chars = (int) tmp_long;
	  }
	  break;

	case 'u':
	  mode = output_unique;
	  break;

	case 'w':
	  {
	    long int tmp_long;
	    if (xstrtol (optarg, NULL, 10, &tmp_long, "") != LONGINT_OK
		|| tmp_long <= 0 || tmp_long > INT_MAX)
	      error (EXIT_FAILURE, 0,
		     _("invalid number of bytes to compare: `%s'"),
		     optarg);
	    check_chars = (int) tmp_long;
	  }
	  break;

	case_GETOPT_HELP_CHAR;

	case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

	default:
	  usage (1);
	}
    }

  if (optind >= 2 && !STREQ (argv[optind - 1], "--"))
    {
      /* Interpret non-option arguments with leading `+' only
	 if we haven't seen `--'.  */
      while (optind < argc && argv[optind][0] == '+')
	{
	  char *opt_str = argv[optind++];
	  long int tmp_long;
	  if (xstrtol (opt_str, NULL, 10, &tmp_long, "") != LONGINT_OK
	      || tmp_long <= 0 || tmp_long > INT_MAX)
	    error (EXIT_FAILURE, 0,
		   _("invalid number of bytes to compare: `%s'"),
		   opt_str);
	  skip_chars = (int) tmp_long;
	}
    }

  if (optind < argc)
    infile = argv[optind++];

  if (optind < argc)
    outfile = argv[optind++];

  if (optind < argc)
    {
      error (0, 0, _("too many arguments"));
      usage (1);
    }

  if (countmode == count_occurrences && mode == output_all_repeated)
    {
      error (0, 0,
	   _("printing all duplicated lines and repeat counts is meaningless"));
      usage (1);
    }

  if (last_duplicate && (start_duplicate !=1 || count_duplicate != 0))
    {
      error (0, 0,
	   _("cannot mix --last with --first or --number"));
      usage (1);
    }

  check_file (infile, outfile);

  exit (EXIT_SUCCESS);
}
