Hi,

I have a set of working patches for "fossil export" and "fossil import"
as attached.

Please note that this is clean, reverse-engineered code, written independently
of the code in git using the included code analysis result.

This code is BSD-2-Clause licensed, and I agree to copyright
assignment to Hipp, Wyrick & Company, Inc. for any patches submitted to
this ML.

Please review and consider.

Osamu

PS:  The previous "fossil import" code could not handle some
non-standard filenames.
From: Osamu Aoki <os...@debian.org>
Date: Tue, 11 Oct 2016 23:45:30 +0900
Subject: Fix export

Newline and UTF-8 characters in filenames are now allowed

The git-fast-export code was analyzed as below.  This analysis was used to
write the independent code in this patch.

== Git fast-export filename encoding spec ==

Filenames exported by "git fast-export" are encoded and quoted if they contain
some non-plain ASCII alphanumeric characters.

Filenames imported by "git fast-import" are decoded if they are recorded within
double quotes.

This memo documents the details of encoding and quotation to enable creating
clean, reverse-engineered, GPL-unencumbered code under the BSD-2-Clause license.

In Git 2.9.3, the focal points defining this feature are:

    * quote.c                 quote_c_style
        Escape odd characters and quote the escaped string
        Return TRUE if used

    * builtin/fast-export.c   print_path_1
        Use quote_c_style if it finds the need to quote and escape
        Otherwise, quote entire string if ' ' is found in it
        If neither, use the original filename to export

    * quote.c                 unquote_c_style
        Unescape escaped characters

The following table summarizes the end result for each character found in the
filename for the combined effects of the above 2 functions:

             01234567 89abcdef
        0x00 OOOOOOOa btnvfrOO
        0x10 OOOOOOOO OOOOOOOO
        0x20 QAEAAAAA AAAAAAAA  (quote space and escape double quote)
        0x30 AAAAAAAA AAAAAAAA
        0x40 AAAAAAAA AAAAAAAA
        0x50 AAAAAAAA AAAAEAAA  (escape backslash)
        0x60 AAAAAAAA AAAAAAAA
        0x70 AAAAAAAA AAAAAAAO  (DEL is octal!)
        0x80 OOOOOOOO OOOOOOOO
        0x90 OOOOOOOO OOOOOOOO
        0xa0 OOOOOOOO OOOOOOOO
        0xb0 OOOOOOOO OOOOOOOO
        0xc0 OOOOOOOO OOOOOOOO
        0xd0 OOOOOOOO OOOOOOOO
        0xe0 OOOOOOOO OOOOOOOO
        0xf0 OOOOOOOO OOOOOOOO

Here each character's code is the hexadecimal sum of its row and column indices.

The default of the quoting flag is FALSE.  The meanings of the conversion rule
indicators are as follows:
---
 src/export.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 4 deletions(-)

diff --git a/src/export.c b/src/export.c
index f524cdc..e677351 100644
--- a/src/export.c
+++ b/src/export.c
@@ -305,6 +305,49 @@ void export_marks(FILE* f, Bag *blobs, Bag *vers){
 }
 
 /*
+** Quote and escape a filename to be exported if it contains some special
+** characters.  This implements not only the minimum requirements
+** '\\', '"' and '\n' but also 3-digit octal escapes for all high-bit
+** characters and other standard single-character escapes such as
+** '\a'..'\r', on par with what git-fast-export does.
+*/
+static void quote_git_filename(const char *zName, char *name){
+  int i, j;
+  int needQuote;
+  static char escs[] = "abtnvfr";
+  needQuote = 0;
+  for(i=0; zName[i]!=0; i++){
+    if( zName[i]<=' ' || zName[i]>'~' || zName[i]=='\\' || zName[i]=='"' ){
+      needQuote = 1;
+    }
+  }
+  j = 0;
+  if( needQuote==1 ){
+    name[j++] = '"';
+  }
+  for(i=0; zName[i]!=0; i++){
+    if( zName[i]=='\\' || zName[i]=='"' ){
+      name[j++] = '\\';
+      name[j++] = zName[i];
+    }else if ( zName[i]>=' ' && zName[i]<='~' ){
+      name[j++] = zName[i];
+    }else if( zName[i]>='\a' && zName[i]<='\r' ){
+      name[j++] = '\\';
+      name[j++] = escs[( zName[i] - '\a')];
+    }else{
+      name[j++] = '\\';
+      name[j++] = ((zName[i] >> 6) & '\03') + '0';
+      name[j++] = ((zName[i] >> 3) & '\07') + '0';
+      name[j++] = ((zName[i] >> 0) & '\07') + '0';
+    }
+  }
+  if( needQuote==1 ){
+    name[j++] = '"';
+  }
+  name[j] = '\0';
+}
+
+/*
 ** COMMAND: export
 **
 ** Usage: %fossil export --git ?OPTIONS? ?REPOSITORY?
@@ -516,19 +559,28 @@ void export_cmd(void){
     );
     while( db_step(&q4)==SQLITE_ROW ){
       const char *zName = db_column_text(&q4,0);
+      char *name;
       int zNew = db_column_int(&q4,1);
       int mPerm = db_column_int(&q4,2);
-      if( zNew==0)
-        printf("D %s\n", zName);
-      else if( bag_find(&blobs, zNew) ) {
+      if( zName==0 || zName=="" ){
+        name = fossil_malloc( 1 );
+        name[0] = '\0';
+      }else{
+        name = fossil_malloc( 4*strlen(zName) + 2 + 1 );
+      }
+      quote_git_filename(zName, name);
+      if( zNew==0 ){
+        printf("D %s\n", name);
+      }else if( bag_find(&blobs, zNew) ){
         const char *zPerm;
         switch( mPerm ){
           case PERM_LNK:  zPerm = "120000";   break;
           case PERM_EXE:  zPerm = "100755";   break;
           default:        zPerm = "100644";   break;
         }
-        printf("M %s :%d %s\n", zPerm, BLOBMARK(zNew), zName);
+        printf("M %s :%d %s\n", zPerm, BLOBMARK(zNew), name);
       }
+      fossil_free(name);
     }
     db_finalize(&q4);
     db_finalize(&q3);
From: Osamu Aoki <os...@debian.org>
Date: Thu, 13 Oct 2016 21:55:12 +0900
Subject: Fix import

Single-character escape sequences are now properly handled

The git-fast-export code was analyzed.  This analysis was used to write
the independent code in this patch.

The code is written to be robust and to accept non-conforming input.

Octal sequences always have 3 digits, as in git-fast-import.
---
 src/import.c | 67 ++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/src/import.c b/src/import.c
index b74186c..57e65f2 100644
--- a/src/import.c
+++ b/src/import.c
@@ -477,36 +477,63 @@ static ImportFile *import_find_file(const char *zName, int *pI, int mx){
 }
 
 /*
-** Dequote a fast-export filename.  Filenames are normally unquoted.  But
-** if the contain some obscure special characters, quotes might be added.
+** Dequote and unescape a git-fast-export filename.  Filenames are normally
+** unquoted by the git-fast-export command.  But if they contain some special
+** characters, special characters are escaped and quotes are added.  This
+** function reverses this conversion to get the original filename.
 */
 static void dequote_git_filename(char *zName){
-  int n, i, j;
   if( zName==0 || zName[0]!='"' ) return;
-  n = (int)strlen(zName);
+  int n = (int)strlen(zName);
   if( zName[n-1]!='"' ) return;
-  for(i=0, j=1; j<n-1; j++){
-    char c = zName[j];
-    int x;
-    if( c=='\\' ){
-      if( j+3 <= n-1
-       && zName[j+1]>='0' && zName[j+1]<='3'
-       && zName[j+2]>='0' && zName[j+2]<='7'
-       && zName[j+3]>='0' && zName[j+3]<='7'
-       && (x = 64*(zName[j+1]-'0') + 8*(zName[j+2]-'0') + zName[j+3]-'0')!=0
-      ){
-        c = (unsigned char)x;
-        j += 3;
+  /* zName is quoted */
+  int i=0; /* destination index, always i<j */
+  int j=1; /* source index: 0..n-1 */
+  while( j<n-1 ){
+    if( zName[j]=='\\' ){
+      if( j+1<=n-1 && !( zName[j+1]<'0' || zName[j+1]>'3' ) ){
+        switch( zName[j+1] ){
+          case 'a': zName[i++] = '\a'; break;
+          case 'b': zName[i++] = '\b'; break;
+          case 't': zName[i++] = '\t'; break;
+          case 'n': zName[i++] = '\n'; break;
+          case 'v': zName[i++] = '\v'; break;
+          case 'f': zName[i++] = '\f'; break;
+          case 'r': zName[i++] = '\r'; break;
+          case '\\': zName[i++] = '\\'; break;
+          case '"': zName[i++] = '"'; break;
+          default: /* illegal sequence */
+                    zName[i++] = '\\';
+                    zName[i++] = zName[j+1]; break;
+        }
+        j += 2;
+      }else if( j+3<=n-1
+                && zName[j+1]>='0' && zName[j+1]<='3'
+                && zName[j+2]>='0' && zName[j+2]<='7'
+                && zName[j+3]>='0' && zName[j+3]<='7' ){
+        /* Only 3 digits are used by the git-fast-export octal escape */
+        zName[i++] = (zName[j+1]-'0')<<6;
+        zName[i++] |= (zName[j+2]-'0')<<3;
+        zName[i++] |= (zName[j+3]-'0')<<0;
+        j += 4;
+      }else if(j+1<=n-1){
+        /* illegal sequence */
+        zName[i++] = zName[j];
+        zName[i++] = zName[j+1];
+        j += 2;
       }else{
-        c = zName[++j];
+        /* illegal sequence */
+        zName[i++] = zName[j];
+        j++;
       }
+    }else{
+      zName[i++] = zName[j];
+      j++;
     }
-    zName[i++] = c;
   }
-  zName[i] = 0;
+  zName[i] = '\0';
 }
 
-
 /*
 ** Read the git-fast-import format from pIn and insert the corresponding
 ** content into the database.

Attachment: signature.asc
Description: PGP signature

Reply via email to