Improve base64 BIO correctness and error reporting

Also improve related documentation. - The BIO_FLAGS_BASE64_NO_NL flag did not behave as advertised, only leading and trailing, but not internal, whitespace was supported: $ echo 'AA AA' | openssl base64 -A -d | wc -c 0 - Switching from ignored leading input to valid base64 input misbehaved when the length of the skipped input was one more than the length of the second and subsequent valid base64 lines in the internal 1k buffer: $ printf '#foo\n#bar\nA\nAAA\nAAAA\n' | openssl base64 -d | wc -c 0 - When the underlying BIO is retriable, and a read returns less than 1k of data, some of the already buffered input lines that could have been decoded and returned were retained internally for a retry by the caller. This is somewhat surprising, and the new code decodes as many of the buffered lines as possible. Issue reported by Michał Trojnara. - After all valid data has been read, the next BIO_read(3) should return 0 when the input was all valid or -1 if an error was detected. This now occurs in more consistently, but further tests and code refactoring may be needed to ensure this always happens. Reviewed-by: Neil Horman <[email protected]> Reviewed-by: Tomas Mraz <[email protected]> (Merged from openssl#25253)
quic · Aug 30, 2024 · 0cd9dd7 · 0cd9dd7
1 parent d1c2c05
commit 0cd9dd7
Show file tree

Hide file tree

Showing 8 changed files with 677 additions and 113 deletions.
diff --git a/crypto/evp/bio_b64.c b/crypto/evp/bio_b64.c
@@ -103,9 +103,17 @@ static int b64_free(BIO *a)
     return 1;
 }
 
+/*
+ * Unless `BIO_FLAGS_BASE64_NO_NL` is set, this BIO ignores leading lines that
+ * aren't exclusively composed of valid Base64 characters (followed by <CRLF>
+ * or <LF>).  Once a valid Base64 line is found, `ctx->start` is set to 0 and
+ * lines are processed until EOF or the first line that contains invalid Base64
+ * characters.  In a nod to PEM, lines that start with a '-' (hyphen) are
+ * treated as a soft EOF, rather than an error.
+ */
 static int b64_read(BIO *b, char *out, int outl)
 {
-    int ret = 0, i, ii, j, k, x, n, num, ret_code = 0;
+    int ret = 0, i, ii, j, k, x, n, num, ret_code;
     BIO_B64_CTX *ctx;
     unsigned char *p, *q;
     BIO *next;
@@ -128,7 +136,7 @@ static int b64_read(BIO *b, char *out, int outl)
         EVP_DecodeInit(ctx->base64);
     }
 
-    /* First check if there are bytes decoded/encoded */
+    /* First check if there are buffered bytes already decoded */
     if (ctx->buf_len > 0) {
         OPENSSL_assert(ctx->buf_len >= ctx->buf_off);
         i = ctx->buf_len - ctx->buf_off;
@@ -146,14 +154,17 @@ static int b64_read(BIO *b, char *out, int outl)
         }
     }
 
+    /* Restore any non-retriable error condition (ctx->cont < 0) */
+    ret_code = ctx->cont < 0 ? ctx->cont : 0;
+
     /*
-     * At this point, we have room of outl bytes and an empty buffer, so we
-     * should read in some more.
+     * At this point, we have room of outl bytes and an either an empty buffer,
+     * or outl == 0, so we'll attempt to read in some more.
      */
-
-    ret_code = 0;
     while (outl > 0) {
-        if (ctx->cont <= 0)
+        int again = ctx->cont;
+
+        if (again <= 0)
             break;
 
         i = BIO_read(next, &(ctx->tmp[ctx->tmp_len]),
@@ -164,18 +175,22 @@ static int b64_read(BIO *b, char *out, int outl)
 
             /* Should we continue next time we are called? */
             if (!BIO_should_retry(next)) {
-                ctx->cont = i;
-                /* If buffer empty break */
-                if (ctx->tmp_len == 0)
-                    break;
-                /* Fall through and process what we have */
-                else
-                    i = 0;
+                /* Incomplete final Base64 chunk in the decoder is an error */
+                if (ctx->tmp_len == 0) {
+                    if (EVP_DecodeFinal(ctx->base64, NULL, &num) < 0)
+                        ret_code = -1;
+                    EVP_DecodeInit(ctx->base64);
+                }
+                ctx->cont = ret_code;
             }
-            /* else we retry and add more data to buffer */
-            else
+            if (ctx->tmp_len == 0)
                 break;
+            /* Fall through and process what we have */
+            i = 0;
+            /* But don't loop to top-up even if the buffer is not full! */
+            again = 0;
         }
+
         i += ctx->tmp_len;
         ctx->tmp_len = i;
 
@@ -204,23 +219,23 @@ static int b64_read(BIO *b, char *out, int outl)
                 }
 
                 k = EVP_DecodeUpdate(ctx->base64, ctx->buf, &num, p, q - p);
-                if (k <= 0 && num == 0 && ctx->start) {
-                    EVP_DecodeInit(ctx->base64);
-                } else {
-                    if (p != ctx->tmp) {
-                        i -= p - ctx->tmp;
-                        for (x = 0; x < i; x++)
-                            ctx->tmp[x] = p[x];
-                    }
-                    EVP_DecodeInit(ctx->base64);
-                    ctx->start = 0;
-                    break;
+                EVP_DecodeInit(ctx->base64);
+                if (k <= 0 && num == 0) {
+                    p = q;
+                    continue;
+                }
+
+                ctx->start = 0;
+                if (p != ctx->tmp) {
+                    i -= p - ctx->tmp;
+                    for (x = 0; x < i; x++)
+                        ctx->tmp[x] = p[x];
                 }
-                p = q;
+                break;
             }
 
             /* we fell off the end without starting */
-            if (j == i && num == 0) {
+            if (ctx->start) {
                 /*
                  * Is this is one long chunk?, if so, keep on reading until a
                  * new line.
@@ -231,54 +246,39 @@ static int b64_read(BIO *b, char *out, int outl)
                         ctx->tmp_nl = 1;
                         ctx->tmp_len = 0;
                     }
-                } else if (p != q) { /* finished on a '\n' */
+                } else if (p != q) {
+                    /* Retain partial line at end of buffer */
                     n = q - p;
                     for (ii = 0; ii < n; ii++)
                         ctx->tmp[ii] = p[ii];
                     ctx->tmp_len = n;
+                } else {
+                    /* All we have is newline terminated non-start data */
+                    ctx->tmp_len = 0;
                 }
-                /* else finished on a '\n' */
-                continue;
+                /*
+                 * Try to read more if possible, otherwise we can't make
+                 * progress unless the underlying BIO is retriable and may
+                 * produce more data next time we're called.
+                 */
+                if (again > 0)
+                    continue;
+                else
+                    break;
             } else {
                 ctx->tmp_len = 0;
             }
-        } else if (i < B64_BLOCK_SIZE && ctx->cont > 0) {
+        } else if (i < B64_BLOCK_SIZE && again > 0) {
             /*
              * If buffer isn't full and we can retry then restart to read in
              * more data.
              */
             continue;
         }
 
-        if ((BIO_get_flags(b) & BIO_FLAGS_BASE64_NO_NL) != 0) {
-            int z, jj;
-
-            jj = i & ~3;        /* process per 4 */
-            z = EVP_DecodeBlock(ctx->buf, ctx->tmp, jj);
-            if (jj > 2) {
-                if (ctx->tmp[jj - 1] == '=') {
-                    z--;
-                    if (ctx->tmp[jj - 2] == '=')
-                        z--;
-                }
-            }
-            /*
-             * z is now number of output bytes and jj is the number consumed
-             */
-            if (jj != i) {
-                memmove(ctx->tmp, &ctx->tmp[jj], i - jj);
-                ctx->tmp_len = i - jj;
-            }
-            ctx->buf_len = 0;
-            if (z > 0) {
-                ctx->buf_len = z;
-            }
-            i = z;
-        } else {
-            i = EVP_DecodeUpdate(ctx->base64, ctx->buf, &ctx->buf_len,
-                                 ctx->tmp, i);
-            ctx->tmp_len = 0;
-        }
+        i = EVP_DecodeUpdate(ctx->base64, ctx->buf, &ctx->buf_len,
+                             ctx->tmp, i);
+        ctx->tmp_len = 0;
         /*
          * If eof or an error was signalled, then the condition
          * 'ctx->cont <= 0' will prevent b64_read() from reading
@@ -289,7 +289,7 @@ static int b64_read(BIO *b, char *out, int outl)
 
         ctx->buf_off = 0;
         if (i < 0) {
-            ret_code = 0;
+            ret_code = ctx->start ? 0 : i;
             ctx->buf_len = 0;
             break;
         }

diff --git a/crypto/evp/encode.c b/crypto/evp/encode.c
@@ -443,7 +443,7 @@ static int evp_decodeblock_int(EVP_ENCODE_CTX *ctx, unsigned char *t,
         b = conv_ascii2bin(*(f++), table);
         c = conv_ascii2bin(*(f++), table);
         d = conv_ascii2bin(*(f++), table);
-        if ((a & 0x80) || (b & 0x80) || (c & 0x80) || (d & 0x80))
+        if ((a | b | c | d) & 0x80)
             return -1;
         l = ((((unsigned long)a) << 18L) |
              (((unsigned long)b) << 12L) |

diff --git a/doc/man3/BIO_f_base64.pod b/doc/man3/BIO_f_base64.pod
@@ -25,11 +25,28 @@ For writing, by default output is divided to lines of length 64
 characters and there is a newline at the end of output.
 This behavior can be changed with B<BIO_FLAGS_BASE64_NO_NL> flag.
 
-For reading, first line should be at most 1024 bytes long including newline
-unless the flag B<BIO_FLAGS_BASE64_NO_NL> is set.
-Further input lines can be of any length (i.e., newlines may appear anywhere
+For reading, the first line of base64 content should be at most 1024 bytes long
+including newline unless the flag B<BIO_FLAGS_BASE64_NO_NL> is set.
+Subsequent input lines can be of any length (i.e., newlines may appear anywhere
 in the input) and a newline at the end of input is not needed.
 
+Also when reading, unless the flag B<BIO_FLAGS_BASE64_NO_NL> is set, initial
+lines that contain non-base64 content (whitespace is tolerated and ignored) are
+skipped, as are lines longer than 1024 bytes.
+Decoding starts with the first line that is shorter than 1024 bytes (including
+the newline) and consists of only (at least one) valid base64 characters plus
+optional whitespace.
+Decoding stops when base64 padding is encountered, a soft end-of-input
+character (B<->, see L<EVP_DecodeUpdate(3)>) occurs as the first byte after a
+complete group of 4 valid base64 characters is decoded, or when an error occurs
+(e.g. due to input characters other than valid base64 or whitespace).
+
+If decoding stops as a result of an error, the first L<BIO_read(3)> that
+returns no decoded data will typically return a negative result, rather
+than 0 (which indicates normal end of input).
+However, a negative return value can also occur if the underlying BIO
+supports retries, see L<BIO_should_read(3)> and L<BIO_set_mem_eof_return(3)>.
+
 BIO_flush() on a base64 BIO that is being written through is
 used to signal that no more data is to be encoded: this is used
 to flush the final block through the BIO.
@@ -64,7 +81,7 @@ to standard output:
 
  BIO_free_all(b64);
 
-Read Base64 encoded data from standard input and write the decoded
+Read base64 encoded data from standard input and write the decoded
 data to standard output:
 
  BIO *bio, *b64, *bio_out;
@@ -83,16 +100,30 @@ data to standard output:
 
 =head1 BUGS
 
-On decoding, if the flag B<BIO_FLAGS_BASE64_NO_NL> is not set and
-the first 1024 bytes of input do not include a newline character
-the first two lines of input are ignored.
+The hyphen character (B<->) is treated as an ad hoc soft end-of-input
+character when it occurs at the start of a base64 group of 4 encoded
+characters.
 
-The ambiguity of EOF in base64 encoded data can cause additional
-data following the base64 encoded block to be misinterpreted.
+This heuristic works to detect the ends of base64 blocks in PEM or
+multi-part MIME, provided there are no stray hyphens in the middle
+input.
+But it is just a heuristic, and sufficiently unusual input could produce
+unexpected results.
 
-There should be some way of specifying a test that the BIO can perform
+There should perhaps be some way of specifying a test that the BIO can perform
 to reliably determine EOF (for example a MIME boundary).
 
+It may be possible for L<BIO_read(3)> to return zero, rather than -1, even if
+an error has been detected, more tests are needed to cover all the potential
+error paths.
+
+=head1 SEE ALSO
+
+L<BIO_read(3)>,
+L<BIO_should_read(3)>,
+L<BIO_set_mem_eof_return(3)>,
+L<EVP_DecodeUpdate(3)>.
+
 =head1 COPYRIGHT
 
 Copyright 2000-2022 The OpenSSL Project Authors. All Rights Reserved.