xref: /illumos-gate/usr/src/lib/libsqlite/src/encode.c (revision 1da57d55)
/*
** 2002 April 25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains helper routines used to translate binary data into
** a null-terminated string (suitable for use in SQLite) and back again.
** These are convenience routines for use by people who want to store binary
** data in an SQLite database.  The code in this file is not used by any other
** part of the SQLite library.
**
** $Id: encode.c,v 1.12 2004/03/17 18:44:46 drh Exp $
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
227c478bd9Sstevel@tonic-gate 
/*
** How This Encoder Works
**
** The output is allowed to contain any character except 0x27 (') and
** 0x00.  This is accomplished by using an escape character to encode
** 0x27 and 0x00 as a two-byte sequence.  The escape character is always
** 0x01.  An 0x00 is encoded as the two byte sequence 0x01 0x01.  The
** 0x27 character is encoded as the two byte sequence 0x01 0x28.  Finally,
** the escape character itself is encoded as the two-character sequence
** 0x01 0x02.
**
** To summarize, the encoder works by using escape sequences as follows:
**
**       0x00  ->  0x01 0x01
**       0x01  ->  0x01 0x02
**       0x27  ->  0x01 0x28
**
** If that were all the encoder did, it would work, but in certain cases
** it could double the size of the encoded string.  For example, to
** encode a string of 100 0x27 characters would require 100 instances of
** the 0x01 0x28 escape sequence resulting in a 200-character output.
** We would prefer to keep the size of the encoded string smaller than
** this.
**
** To minimize the encoding size, we first subtract a fixed offset value
** from each byte in the sequence.  The subtraction is modulo 256.  (That
** is to say, if the original character value is less than the offset,
** the difference wraps around into the range 0x00 through 0xff.)  The
** offset is chosen to minimize the number of characters in the string
** that need to be escaped.  For example, in the case above where the
** string was composed of 100 0x27 characters, the offset might be 0x01.
** Each of the 0x27 characters would then be converted into an 0x26
** character which would not need to be escaped at all and so the 100
** character input string would be converted into just 100 characters of
** output.  Actually 101 characters of output - we have to record the
** offset used as the first byte in the sequence so that the string can
** be decoded.  Since the offset value is stored as part of the output
** string and the output string is not allowed to contain characters 0x00
** or 0x27, the offset cannot be 0x00 or 0x27.
**
** Here, then, are the encoding steps:
**
**     (1)   Choose an offset value and make it the first character of
**           output.
**
**     (2)   Copy each input character into the output buffer, one by
**           one, subtracting the offset value as you copy.
**
**     (3)   If the value of an input character minus the offset is 0x00,
**           replace that one character by the two-character sequence
**           0x01 0x01.  If the difference is 0x01, replace it with
**           0x01 0x02.  If the difference is 0x27, replace it with
**           0x01 0x28.
**
**     (4)   Put a 0x00 terminator at the end of the output.
**
** Decoding is obvious:
**
**     (5)   Copy encoded characters except the first into the decode
**           buffer.  Set the first encoded character aside for use as
**           the offset in step 7 below.
**
**     (6)   Convert each 0x01 0x01 sequence into a single character 0x00.
**           Convert 0x01 0x02 into 0x01.  Convert 0x01 0x28 into 0x27.
**
**     (7)   Add the offset value that was the first character of
**           the encoded buffer to all characters in the output buffer.
**
** The only tricky part is step (1) - how to compute an offset value to
** minimize the size of the output buffer.  This is accomplished by testing
** all offset values and picking the one that results in the fewest number
** of escapes.  To do that, we first scan the entire input and count the
** number of occurrences of each character value in the input.  Suppose
** the number of 0x00 characters is N(0), the number of occurrences of 0x01
** is N(1), and so forth up to the number of occurrences of 0xff is N(255).
** An offset of 0 is not allowed so we don't have to test it.  The number
** of escapes required for an offset of 1 is N(1)+N(2)+N(40).  The number
** of escapes required for an offset of 2 is N(2)+N(3)+N(41).  And so forth.
** In this way we find the offset that gives the minimum number of escapes,
** and thus minimizes the length of the output string.
*/
1027c478bd9Sstevel@tonic-gate 
/*
** Encode a binary buffer "in" of size n bytes so that it contains
** no instances of characters '\'' or '\000'.  The output is
** null-terminated and can be used as a string value in an INSERT
** or UPDATE statement.  Use sqlite_decode_binary() to convert the
** string back into its original binary.
**
** The result is written into a preallocated output buffer "out".
** "out" must be able to hold at least 2 +(257*n)/254 bytes.
** In other words, the output will be expanded by as much as 3
** bytes for every 254 bytes of input plus 2 bytes of fixed overhead.
** (This is approximately 2 + 1.0118*n or about a 1.2% size increase.)
**
** The return value is the number of characters in the encoded
** string, excluding the "\000" terminator.
**
** If n<=0 the encoding is the one-character string "x" and the
** return value is 1.
**
** If out==NULL then no output is generated but the routine still returns
** the number of characters that would have been generated if out had
** not been NULL.
*/
int sqlite_encode_binary(const unsigned char *in, int n, unsigned char *out){
  int i, j, m;
  int e = 1;            /* Chosen offset.  Always reassigned below; see note */
  unsigned char x;
  int cnt[256];         /* cnt[v] is the number of input bytes equal to v */

  if( n<=0 ){
    /* Empty input: emit only an (arbitrary, legal) offset byte. */
    if( out ){
      out[0] = 'x';
      out[1] = 0;
    }
    return 1;
  }

  /* Build a histogram of the input bytes. */
  memset(cnt, 0, sizeof(cnt));
  for(i=n-1; i>=0; i--){ cnt[in[i]]++; }

  /* Try every legal offset (anything but 0x00 and 0x27) and keep the one
  ** that needs the fewest escapes.  An input byte needs an escape when
  ** (byte - offset) is 0x00, 0x01 or 0x27.
  **
  ** Note: each histogram bucket is counted by at most three offsets, so
  ** the 254 candidate sums add up to at most 3*n and the smallest of them
  ** is strictly less than n.  The "sum<m" test therefore always succeeds
  ** at least once and "e" is always assigned here; the initializer above
  ** only exists to give it a well-defined, legal value for static analysis.
  */
  m = n;
  for(i=1; i<256; i++){
    int sum;
    if( i=='\'' ) continue;
    sum = cnt[i] + cnt[(i+1)&0xff] + cnt[(i+'\'')&0xff];
    if( sum<m ){
      m = sum;
      e = i;
      if( m==0 ) break;   /* Cannot do better than zero escapes */
    }
  }

  /* Size-only query: offset byte + payload + escapes. */
  if( out==0 ){
    return n+m+1;
  }

  out[0] = (unsigned char)e;
  j = 1;
  for(i=0; i<n; i++){
    x = (unsigned char)(in[i] - e);   /* subtraction is modulo 256 */
    if( x==0 || x==1 || x=='\''){
      /* Emit the escape byte, then the value plus one. */
      out[j++] = 1;
      x++;
    }
    out[j++] = x;
  }
  out[j] = 0;
  assert( j==n+m+1 );
  return j;
}
1647c478bd9Sstevel@tonic-gate 
/*
** Decode the string "in" into binary data and write it into "out".
** This routine reverses the encoding created by sqlite_encode_binary().
** The output will always be a few bytes less than the input.  The number
** of bytes of output is returned.  If the input is not a well-formed
** encoding, -1 is returned.  An encoding is rejected when it is empty
** (there is no offset byte), when the escape byte 0x01 is the last byte
** before the terminator, or when the escape byte is followed by anything
** other than 0x01, 0x02 or 0x28.  When -1 is returned, "out" may already
** have been partially written.
**
** The "in" and "out" parameters may point to the same buffer in order
** to decode a string in place.
*/
int sqlite_decode_binary(const unsigned char *in, unsigned char *out){
  int i, e;
  unsigned char c;

  /* The first byte is the offset chosen by the encoder.  A terminator
  ** here means there is no encoding at all; stop before reading past it. */
  e = *(in++);
  if( e==0 ){
    return -1;
  }

  i = 0;
  while( (c = *(in++))!=0 ){
    if( c==1 ){
      /* Escape sequence: the next byte holds the real value plus one.
      ** Only the three codes the encoder emits are accepted; in particular
      ** a terminator here is a truncated escape and must not be skipped. */
      c = *(in++);
      if( c!=1 && c!=2 && c!='\''+1 ){
        return -1;
      }
      c--;
    }
    /* Undo the encoder's subtraction; the addition is modulo 256.  The
    ** write index never passes the read position, so in-place use is safe. */
    out[i++] = (unsigned char)(c + e);
  }
  return i;
}
1887c478bd9Sstevel@tonic-gate 
#ifdef ENCODER_TEST
#include <stdio.h>
/*
** The subroutines above are not tested by the usual test suite.  To test
** these routines, compile just this one file with a -DENCODER_TEST=1 option
** and run the result.
**
** Each iteration encodes a buffer of pseudo-random length, checks the
** reported length against the actual string length and against the
** size-only (out==NULL) prediction, checks the size bound, checks that
** no single-quote appears in the output, then decodes in place and
** compares with the original input.  Any failure prints a message and
** exits with status 1.
*/
int main(int argc, char **argv){
  int i, j, n, m, nOut, nEnc, nByteIn, nByteOut;
  unsigned char in[30000];
  unsigned char out[33000];

  (void)argc;
  (void)argv;
  nByteIn = nByteOut = 0;
  for(i=0; i<(int)sizeof(in); i++){
    printf("Test %d: ", i+1);
    n = rand() % (i+1);
    if( i%100==0 ){
      /* Every 100th test uses a repeating 0x00..0xff ramp. */
      int k;
      for(j=k=0; j<n; j++){
        in[j] = (unsigned char)k;
        k = (k+1)&0xff;
      }
    }else{
      for(j=0; j<n; j++) in[j] = (unsigned char)(rand() & 0xff);
    }
    nByteIn += n;
    nOut = sqlite_encode_binary(in, n, out);
    nByteOut += nOut;
    /* strlen() returns size_t; convert once so that the comparisons and
    ** the %d conversions below are well defined. */
    nEnc = (int)strlen((char*)out);
    if( nOut!=nEnc ){
      printf(" ERROR return value is %d instead of %d\n", nOut, nEnc);
      exit(1);
    }
    if( nOut!=sqlite_encode_binary(in, n, 0) ){
      printf(" ERROR actual output size disagrees with predicted size\n");
      exit(1);
    }
    m = (256*n + 1262)/253;
    printf("size %d->%d (max %d)", n, nEnc+1, m);
    if( nEnc+1>m ){
      printf(" ERROR output too big\n");
      exit(1);
    }
    for(j=0; out[j]; j++){
      if( out[j]=='\'' ){
        printf(" ERROR contains (')\n");
        exit(1);
      }
    }
    /* Decode in place; in and out are the same buffer. */
    j = sqlite_decode_binary(out, out);
    if( j!=n ){
      printf(" ERROR decode size %d\n", j);
      exit(1);
    }
    if( memcmp(in, out, n)!=0 ){
      printf(" ERROR decode mismatch\n");
      exit(1);
    }
    printf(" OK\n");
  }
  fprintf(stderr,"Finished.  Total encoding: %d->%d bytes\n",
          nByteIn, nByteOut);
  fprintf(stderr,"Avg size increase: %.3f%%\n",
    (nByteOut-nByteIn)*100.0/(double)nByteIn);
  return 0;
}
#endif /* ENCODER_TEST */
255