Invalid unicode ranges in CMap beginbfrange operator

Wed Feb 15 00:09:59 CET 2023

Hi Ben,

    https://github.com/veraPDF/veraPDF-library/issues/1253#issuecomment-1420125850

Thanks for the report. I forwarded it to Han The Thanh (pdftex
author & maintainer) and he made the fix appended below (TL r65837).
The test file from your verapdf issue looks ok now, at least it passes
verapdf.

Thanh wrote me:
  I committed a fix; I think it's safe, but I am not sure since I forgot
  most about this topic.  Credit goes to Ben and the guys from veraPDF
  who provided a minimal test file and the explanation.

Best,
Karl

--- ChangeLog	(revision 65836)
+++ ChangeLog	(working copy)
@@ -1,3 +1,14 @@
+2023-02-14  Thanh Han The  <hanthethanh at gmail.com>
+
+	* tounicode.c (set_glyph_unicode): take new glyph_unicode_entry arg.
+	(is_last_byte_valid): new fn.
+	(write_tounicode): stop writing range when last byte of a
+	beginbfrange is no longer valid, that is, >255.
+	Report from Ben JW:
+	  https://tug.org/pipermail/tex-live/2023-February/048845.html
+	and corresponding veraPDF issue:
+	  https://github.com/veraPDF/veraPDF-library/issues/1253#issuecomment-1420125850
+
 2023-02-14  Hironori Kitagawa  <h_kitagawa2001 at yahoo.co.jp>
 
 	* wcfname.test:
Index: NEWS
===================================================================
--- NEWS	(revision 65836)
+++ NEWS	(working copy)
@@ -17,6 +17,8 @@ pdfTeX 3.141592653-2.6-1.40.25
 
 - bugfixes:
   - finish omission of /Info dict when \pdfomitinfodict is not 0.
+  - generated beginbfrange should no longer be invalid with certain
+    characters (that is, no longer have a last byte >255).
   
 pdfTeX 3.141592653-2.6-1.40.24 (TeX Live 2022)
 - changes:
Index: tounicode.c
===================================================================
--- tounicode.c	(revision 65836)
+++ tounicode.c	(working copy)
@@ -189,7 +189,7 @@ static char *utf16be_str(long code)
  * taking into account tfmname; in case it returns
  * gp->code == UNI_EXTRA_STRING then the caller is responsible for freeing
  * gp->unicode_seq too */
-static void set_glyph_unicode(const char *s, const char* tfmname, 
+static void set_glyph_unicode(const char *s, const char* tfmname,
                               glyph_unicode_entry *gp)
 {
     char buf[SMALL_BUF_SIZE], buf2[SMALL_BUF_SIZE], *p;
@@ -314,7 +314,20 @@ static char *utf16be_str(long code)
     }
 }
 
+static boolean is_last_byte_valid(int srcCode1, int srcCode2, long code)
+{
+    /*
+       When defining ranges of this type, the value of the last byte in the
+       string shall be less than or equal to 255 − (srcCode2 − srcCode1). This
+       ensures that the last byte of the string shall not be incremented past
+       255; otherwise, the result of mapping is undefined.
+    */
+    char *s = strend(utf16be_str(code)) - 2;
+    long l = strtol(s, NULL, 16);
+    return l < 255 - (srcCode2 - srcCode1);
+}
 
+
 /* tfmname is without .tfm extension, but encname ends in .enc; */
 integer write_tounicode(char **glyph_names, const char *tfmname,
                         const char* encname)
@@ -346,7 +359,7 @@ integer write_tounicode(char **glyph_names, const
             pdftex_warn("Dubious encoding file name: `%s'", encname);
     } else { /* this is a builtin encoding, so name is e.g. "cmr10-builtin" */
         assert(strlen(tfmname) + strlen(builtin_suffix) + 1 < SMALL_BUF_SIZE);
-        strcat(buf, builtin_suffix);    
+        strcat(buf, builtin_suffix);
     }
 
     objnum = pdfnewobjnum();
@@ -389,8 +402,10 @@ integer write_tounicode(char **glyph_names, const
             i++;
         } else {                /* gtab[i].code >= 0 */
             j = i;
-            while (i < 256 && gtab[i + 1].code >= 0 &&
-                   gtab[i].code + 1 == gtab[i + 1].code)
+            while (i < 256 && gtab[i + 1].code >= 0
+                    && gtab[i].code + 1 == gtab[i + 1].code
+                    && is_last_byte_valid(j, i, gtab[i].code)
+                  )
                 i++;
             /* at this point i is the last entry of the subrange */
             i++;                /* move i to the next entry */