Welcome to little lamb

Code » limb » commit 6e221e6

Add esc.h & related functions: To (un)escape text/data

author Olivier Brunel
2023-04-17 17:20:53 UTC
committer Olivier Brunel
2023-05-20 18:06:37 UTC
parent 3aa5cac08ad0974b0741d84bc8753d04adcc3ad1

Add esc.h & related functions: To (un)escape text/data

Now buffer_putesc() & co are simply wrappers around escall_fmt(), so it
is possible to implement such escaping outside of buffer use.

It is also possible to read back escaped data into their
original/unescaped form via e.g. escall_scan()

src/doc/buffer.h/buffer_putescall.3.md +1 -35
src/doc/esc.h.0.md +32 -0
src/doc/esc.h/escall_fmt.3.md +128 -0
src/liblimb/buffer.h/buffer_putescall.c +6 -64
src/liblimb/esc.h/esc_fmt.c +14 -0
src/liblimb/esc.h/esc_scan.c +14 -0
src/liblimb/esc.h/escall_fmt.c +76 -0
src/liblimb/esc.h/escall_scan.c +45 -0
src/liblimb/include/limb/esc.h +15 -0
src/mkrabintables/escall_fmt.o +1 -0

diff --git a/src/doc/buffer.h/buffer_putescall.3.md b/src/doc/buffer.h/buffer_putescall.3.md
index 9eef985..2bbabee 100644
--- a/src/doc/buffer.h/buffer_putescall.3.md
+++ b/src/doc/buffer.h/buffer_putescall.3.md
@@ -32,41 +32,7 @@ start.
 The `buffer_putescs`() function does the same as `buffer_putesc`() but expects
 `s` to be a NUL-terminated string.
 
-# ESCAPING
-
-The escaping performed is intended to have the written value inside
-double-quotes. Characters with special escaping are as follow :
-
-: double-quote (`"`)
-:: Escaped by prefixing with a backslash (`\`)
-
-: backslash (`\`)
-:: Escaped by prefixing with a backslash (`\`)
-
-: bell (`0x07`)
-:: Escaped as `\a`
-
-: backspace (`0x08`)
-:: Escaped as `\b`
-
-: tabulation (`0x09`)
-:: Escaped as `\t`
-
-: line feed (`0x0a`)
-:: Escaped as `\n`
-
-: vertical tabulation (`0x0b`)
-:: Escaped as `\v`
-
-: form feed (`0x0c`)
-:: Escaped as `\f`
-
-: carriage return (`0x0d`)
-:: Escaped as `\r`
-
-Anything else will either be written as-in if recognized as a printable
-character, else escaped in hex-mode, that is `\x` followed by the hexadecimal
-code of the character. (For example, character 127 would be escaped as `\x7f`)
+For more about the escaping performed, refer to [esc_fmt](3).
 
 # RETURN VALUE
 
diff --git a/src/doc/esc.h.0.md b/src/doc/esc.h.0.md
new file mode 100644
index 0000000..76ae796
--- /dev/null
+++ b/src/doc/esc.h.0.md
@@ -0,0 +1,32 @@
+% limb manual
+% esc.h(0)
+
+# NAME
+
+esc.h - escaping/unescaping data
+
+
+# SYNOPSIS
+
+    #include <limb/esc.h>
+
+
+# DESCRIPTION
+
+The header defines the functions needed to escape/unescape data.
+
+## Functions
+
+The following functions are defined :
+
+: [escall_fmt](3)
+:: To escape given text/data.
+
+: [escall_scan](3)
+:: To unescape data previously escaped via [escall_fmt](3).
+
+: [esc_fmt](3)
+:: Similar to [escall_fmt](3) but doesn't handle partial processing.
+
+: [esc_scan](3)
+:: Similar to [escall_scan](3) but doesn't handle partial processing.
diff --git a/src/doc/esc.h/escall_fmt.3.md b/src/doc/esc.h/escall_fmt.3.md
new file mode 100644
index 0000000..20a4168
--- /dev/null
+++ b/src/doc/esc.h/escall_fmt.3.md
@@ -0,0 +1,128 @@
+% limb manual
+% escall_fmt(3)
+
+# NAME
+
+escall\_fmt, esc\_fmt - escape given text/data
+
+# SYNOPSIS
+
+    #include <limb/esc.h>
+
+```pre hl
+int escall_fmt(char *<em>dst</em>, size_t <em>dlen</em>, const char *<em>sce</em>, size_t <em>slen</em>, size_t *<em>w</em>, size_t *<em>r</em>)
+int escall_scan(char *<em>dst</em>, size_t <em>dlen</em>, const char *<em>sce</em>, size_t <em>slen</em>, size_t *<em>w</em>, size_t *<em>r</em>)
+
+ssize_t esc_fmt(char *<em>dst</em>, size_t <em>dlen</em>, const char *<em>sce</em>, size_t <em>slen</em>)
+ssize_t esc_scan(char *<em>dst</em>, size_t <em>dlen</em>, const char *<em>sce</em>, size_t <em>slen</em>)
+```
+
+# DESCRIPTION
+
+The `escall_fmt`() function will write the content of `sce` of length `slen`
+bytes, starting at offset pointed by `r` (usually 0), into the memory area
+pointed by `dst` starting at position `w` (usually 0) and never going past
+`dlen`, whilst taking care of escaping characters as needed. This means more
+characters might be written into `dst` than present in `sce`.
+
+The values pointed to by `r` and `w` are updated accordingly to reflect the
+positions of the last read in `sce` and write in `dst`, respectively.
+Specifically, if a byte couldn't be escaped for lack of space in `dst`, the
+value pointed to by `r` would remain on the last successfully processed byte,
+and no "partial write" would have occurred in `dst`.
+
+If `dst` is *NULL* then nothing is written, the data in `sce` is still processed
+and both `r` and `w` (/both/ mandatory) updated accordingly.
+
+The `esc_fmt`() function is similar, only without the `r` and `w` arguments, and
+different return values.
+
+The `escall_scan`() function will read the data pointed to by `sce` of length
+`slen` bytes, starting at offset pointed by `r` (usually 0), and write in the
+memory area pointed by `dst` (up to `dlen` bytes) starting at offset pointed by
+`w` the result of unescaping said data, expected to have been previously
+escaped via `escall_fmt`().
+
+Similarly to `escall_fmt`() the values pointed to by `r` and `w` are updated
+accordingly.
+
+The `esc_scan`() function is similar to `escall_scan`() only without the `r` and
+`w` arguments, and different return values.
+
+# ESCAPING
+
+The escaping performed is intended to have the written value inside
+double-quotes. Characters with special escaping are as follows :
+
+: double-quote (`"`)
+:: Escaped by prefixing with a backslash (`\`)
+
+: backslash (`\`)
+:: Escaped by prefixing with a backslash (`\`)
+
+: bell (`0x07`)
+:: Escaped as `\a`
+
+: backspace (`0x08`)
+:: Escaped as `\b`
+
+: tabulation (`0x09`)
+:: Escaped as `\t`
+
+: line feed (`0x0a`)
+:: Escaped as `\n`
+
+: vertical tabulation (`0x0b`)
+:: Escaped as `\v`
+
+: form feed (`0x0c`)
+:: Escaped as `\f`
+
+: carriage return (`0x0d`)
+:: Escaped as `\r`
+
+Anything else will either be written as-is if recognized as a printable
+character in the current locale, else escaped in hex-mode, that is `\x` followed
+by the hexadecimal code of the character. (For example, character 127 would be
+escaped as `\x7f`)
+
+# RETURN VALUE
+
+The `escall_fmt`() and `escall_scan`() functions return 1 on success. Otherwise
+they return 0 and set `errno` to indicate the error.
+
+In either case, values pointed to by `r` and `w` will have been updated to
+reflect the positions of the last read in `sce` and write in `dst`,
+respectively.
+
+The `esc_fmt`() and `esc_scan`() functions return the number of bytes written
+into `dst` on success. Otherwise they return -1 and set `errno` to indicate the
+error.
+
+# ERRORS
+
+The `escall_fmt`(), `esc_fmt`(), `escall_scan`() and `esc_scan`() functions may
+fail if :
+
+: *ENOBUFS*
+:: Not enough space in `dst`.
+
+The `escall_fmt`() and `escall_scan`() functions may also fail if :
+
+: *EINVAL*
+:: Either `r` or `w` was too high (more than `slen` or `dlen`, respectively).
+
+The `escall_scan`() and `esc_scan`() functions may also fail if :
+
+: *EINVAL*
+:: Data in `sce` is malformed/invalid. E.g. a backslash followed by a byte
+:: other than an allowed one.
+
+# NOTES
+
+The behavior of the `escall_fmt`() and `esc_fmt`() functions depends on the
+*LC_CTYPE* category of the current locale.
+
+# SEE ALSO
+
+[buffer_putesc](3)
diff --git a/src/liblimb/buffer.h/buffer_putescall.c b/src/liblimb/buffer.h/buffer_putescall.c
index 69a2ed9..a8681db 100644
--- a/src/liblimb/buffer.h/buffer_putescall.c
+++ b/src/liblimb/buffer.h/buffer_putescall.c
@@ -2,80 +2,22 @@
  * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
 /* SPDX-License-Identifier: GPL-2.0-only */
 #include <errno.h>
-#include <string.h>
-#include <ctype.h>
-#include <wctype.h>
-#include <wchar.h>
-#include <skalibs/fmtscan.h>
 #include <limb/buffer.h>
+#include <limb/esc.h>
 
 size_t
 buffer_putescall(buffer *b, const char *s, size_t len, size_t *pos)
 {
     if (*pos > len) return (errno = EINVAL, 0);
-
-    const char direct[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
-    char tmp[4] = "\\";
     size_t written = 0;
 
     for (;;) {
-        size_t w;
-
-        /* common chars we can directly put */
-        if (memchr(direct, s[*pos], sizeof(direct) - 1)) {
-            w = buffer_putnoflush(b, s + *pos, 1);
-            written += w;
-        } else {
-            const char *t;
-            size_t l;
-
-            /* basic backslash escaping */
-            if (s[*pos] == '\\' || s[*pos] == '"') {
-                tmp[1] = s[*pos];
-                t = tmp;
-                l = 2;
-                w = 1;
-            /* simple backslash escaping */
-            } else if (s[*pos] >= 7 && s[*pos] <= 13) {
-                const char esc[7] = "abtnvfr";
-                tmp[1] = esc[s[*pos] - 7];
-                t = tmp;
-                l = 2;
-                w = 1;
-            } else {
-                mbstate_t state = { 0 };
-                /* try to get a multibyte char */
-                wchar_t wc;
-                w = mbrtowc(&wc, s + *pos, len - *pos, &state);
-                /* if it is one and is printable, put the bytes */
-                if (w && w != (size_t) -2 && w != (size_t) -1 && iswprint(wc)) {
-                    t = s + *pos;
-                    l = w;
-                /* just a single-byte char */
-                } else if (isprint(s[*pos])) {
-                    w = buffer_putnoflush(b, s + *pos, 1);
-                    written += w;
-                    goto next;
-                /* hexa-escaping */
-                } else {
-                    tmp[1] = 'x';
-                    ucharn_fmt(tmp + 2, s + *pos, 1);
-                    t = tmp;
-                    l = 4;
-                    w = 1;
-                }
-            }
-
-            size_t bw = 0;
-            if (!buffer_putall(b, t, l, &bw))
-                return 0;
-            written += bw;
-        }
+        char buf[64];
+        size_t w = 0;
 
-next:
-        *pos += w;
-        if (*pos >= len)
-            return written;
+        escall_fmt(buf, sizeof(buf), s, len, &w, pos);
+        written += buffer_putnoflush(b, buf, w);
+        if (*pos >= len) return written;
 
         if (buffer_isfull(b)) {
             buffer_flush(b);
diff --git a/src/liblimb/esc.h/esc_fmt.c b/src/liblimb/esc.h/esc_fmt.c
new file mode 100644
index 0000000..e179ef3
--- /dev/null
+++ b/src/liblimb/esc.h/esc_fmt.c
@@ -0,0 +1,14 @@
+/* This file is part of limb                           https://lila.oss/limb
+ * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <limb/esc.h>
+
+/* One-shot escaping: runs escall_fmt() over the whole of sce, starting at
+ * offset 0 on both sides.
+ * Returns the number of bytes accounted for in dst, or -1 when escall_fmt()
+ * reports a failure. */
+ssize_t
+esc_fmt(char *dst, size_t dlen, const char *sce, size_t slen)
+{
+    size_t written = 0;
+    size_t consumed = 0;
+
+    if (!escall_fmt(dst, dlen, sce, slen, &written, &consumed))
+        return -1;
+
+    return written;
+}
diff --git a/src/liblimb/esc.h/esc_scan.c b/src/liblimb/esc.h/esc_scan.c
new file mode 100644
index 0000000..5b5e661
--- /dev/null
+++ b/src/liblimb/esc.h/esc_scan.c
@@ -0,0 +1,14 @@
+/* This file is part of limb                           https://lila.oss/limb
+ * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <limb/esc.h>
+
+/* One-shot unescaping: runs escall_scan() over the whole of sce, starting at
+ * offset 0 on both sides.
+ * Returns the number of bytes accounted for in dst, or -1 when escall_scan()
+ * reports a failure. */
+ssize_t
+esc_scan(char *dst, size_t dlen, const char *sce, size_t slen)
+{
+    size_t written = 0;
+    size_t consumed = 0;
+
+    if (!escall_scan(dst, dlen, sce, slen, &written, &consumed))
+        return -1;
+
+    return written;
+}
diff --git a/src/liblimb/esc.h/escall_fmt.c b/src/liblimb/esc.h/escall_fmt.c
new file mode 100644
index 0000000..a95e1bc
--- /dev/null
+++ b/src/liblimb/esc.h/escall_fmt.c
@@ -0,0 +1,76 @@
+/* This file is part of limb                           https://lila.oss/limb
+ * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <ctype.h>
+#include <errno.h>
+#include <string.h>
+#include <wchar.h>
+#include <wctype.h>
+#include <skalibs/fmtscan.h>
+#include <limb/esc.h>
+
+/* Escape the bytes of sce from offset *r up to slen into dst from offset *w,
+ * never writing at or past dlen. *r and *w are advanced after each fully
+ * processed item, so on failure they sit right after the last complete one
+ * and no partial escape sequence has been written.
+ *
+ * When dst is NULL nothing is written and, past the initial offset check,
+ * dlen does not limit processing: *w ends up as the size needed.
+ *
+ * Returns 1 once all of sce was consumed. Otherwise returns 0 and sets errno
+ * to EINVAL (*w > dlen or *r > slen on entry) or ENOBUFS (dst too small). */
+int
+escall_fmt(char *dst, size_t dlen, const char *sce, size_t slen, size_t *w, size_t *r)
+{
+    if (*w > dlen || *r > slen) return (errno = EINVAL, 0);
+
+    /* whenever dst is set, *w <= dlen holds throughout, so the `dlen - *w`
+     * space checks below cannot underflow (nor can they overflow, unlike
+     * `*w + n > dlen`) */
+    while ((!dst || *w < dlen) && *r < slen) {
+        const char c = sce[*r];
+
+        /* printable ASCII chars we can directly put */
+        if (c != '\\' && c != '"' && c >= 32 && c <= 126) {
+            if (dst) dst[*w] = c;
+            ++*w;
+            ++*r;
+        /* backslash escaping: \\ and \" as-is, 0x07..0x0d as \a .. \r */
+        } else if (c == '\\' || c == '"' || (c >= 7 && c <= 13)) {
+            if (dst) {
+                const char esc[7] = "abtnvfr";
+                if (dlen - *w < 2) return (errno = ENOBUFS, 0);
+                dst[*w] = '\\';
+                dst[*w + 1] = (c >= 7 && c <= 13) ? esc[c - 7] : c;
+            }
+            *w += 2;
+            *r += 1;
+        } else {
+            mbstate_t state = { 0 };
+            /* try to get a multibyte char */
+            wchar_t wc;
+            size_t l = mbrtowc(&wc, sce + *r, slen - *r, &state);
+            /* if it is one and is printable, put the bytes */
+            if (l && l != (size_t) -2 && l != (size_t) -1 && iswprint(wc)) {
+                if (dst) {
+                    if (dlen - *w < l) return (errno = ENOBUFS, 0);
+                    memcpy(dst + *w, sce + *r, l);
+                }
+                *w += l;
+                *r += l;
+            /* just a single-byte char; the cast matters: passing a negative
+             * plain char to isprint() is undefined behavior */
+            } else if (isprint((unsigned char) c)) {
+                if (dst) dst[*w] = c;
+                ++*w;
+                ++*r;
+            /* hexa-escaping */
+            } else {
+                if (dst) {
+                    if (dlen - *w < 4) return (errno = ENOBUFS, 0);
+                    dst[*w] = '\\';
+                    dst[*w + 1] = 'x';
+                    ucharn_fmt(dst + *w + 2, sce + *r, 1);
+                }
+                *w += 4;
+                *r += 1;
+            }
+        }
+    }
+    /* only reachable with data left when dst is set and already full */
+    if (*r < slen) return (errno = ENOBUFS, 0);
+
+    return 1;
+}
diff --git a/src/liblimb/esc.h/escall_scan.c b/src/liblimb/esc.h/escall_scan.c
new file mode 100644
index 0000000..abe1000
--- /dev/null
+++ b/src/liblimb/esc.h/escall_scan.c
@@ -0,0 +1,45 @@
+/* This file is part of limb                           https://lila.oss/limb
+ * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <errno.h>
+#include <skalibs/fmtscan.h>
+#include <limb/bytestr.h>
+#include <limb/esc.h>
+
+/* Unescape the bytes of sce from offset *r up to slen (expected to come from
+ * escall_fmt()) into dst from offset *w, never writing at or past dlen.
+ * *r and *w are only advanced once an item was fully decoded, so on failure
+ * *r still points at the start of the offending/unprocessed item.
+ *
+ * When dst is NULL nothing is written but the input is validated exactly the
+ * same way, and dlen does not limit processing past the initial offset check.
+ *
+ * Returns 1 once all of sce was consumed. Otherwise returns 0 and sets errno
+ * to EINVAL (bad offsets on entry, or malformed/truncated escape sequence) or
+ * ENOBUFS (dst too small). */
+int
+escall_scan(char *dst, size_t dlen, const char *sce, size_t slen, size_t *w, size_t *r)
+{
+    if (*w > dlen || *r > slen) return (errno = EINVAL, 0);
+
+    while ((!dst || *w < dlen) && *r < slen) {
+        size_t n = 1;       /* bytes of sce making up the current item */
+        char c = sce[*r];   /* decoded byte to store */
+
+        if (c == '\\') {
+            /* a lone trailing backslash: never read past slen */
+            if (slen - *r < 2) return (errno = EINVAL, 0);
+            c = sce[*r + 1];
+            n = 2;
+
+            if (c == 'x') {
+                /* \xHH needs both hex digits to be present */
+                if (slen - *r < 4) return (errno = EINVAL, 0);
+                /* unsigned so an out-of-range result can't turn negative and
+                 * slip under the `>= 16` test */
+                unsigned char hi = fmtscan_num(sce[*r + 2], 16);
+                unsigned char lo = fmtscan_num(sce[*r + 3], 16);
+                if (hi >= 16 || lo >= 16) return (errno = EINVAL, 0);
+                c = (char) ((hi << 4) | lo);
+                n = 4;
+            } else if (c != '\\' && c != '"') {
+                /* \a .. \r map back to 0x07 .. 0x0d */
+                const char esc[7] = "abtnvfr";
+                size_t i = byte_chr(esc, 7, c);
+                if (i == 7) return (errno = EINVAL, 0);
+                c = (char) (7 + i);
+            }
+        }
+
+        if (dst) dst[*w] = c;
+        ++*w;
+        *r += n;
+    }
+    /* only reachable with data left when dst is set and already full */
+    if (*r < slen) return (errno = ENOBUFS, 0);
+
+    return 1;
+}
diff --git a/src/liblimb/include/limb/esc.h b/src/liblimb/include/limb/esc.h
new file mode 100644
index 0000000..56e1456
--- /dev/null
+++ b/src/liblimb/include/limb/esc.h
@@ -0,0 +1,15 @@
+/* This file is part of limb                           https://lila.oss/limb
+ * Copyright (C) 2023 Olivier Brunel                          jjk@jjacky.com */
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef LIMB_ESC_H
+#define LIMB_ESC_H
+
+#include <sys/types.h> /* {,s}size_t */
+
+extern int escall_fmt(char *dst, size_t dlen, const char *sce, size_t slen, size_t *w, size_t *r);
+extern int escall_scan(char *dst, size_t dlen, const char *sce, size_t slen, size_t *w, size_t *r);
+
+extern ssize_t esc_fmt(char *dst, size_t dlen, const char *sce, size_t slen);
+extern ssize_t esc_scan(char *dst, size_t dlen, const char *sce, size_t slen);
+
+#endif /* LIMB_ESC_H */
diff --git a/src/mkrabintables/escall_fmt.o b/src/mkrabintables/escall_fmt.o
new file mode 120000
index 0000000..45e3e20
--- /dev/null
+++ b/src/mkrabintables/escall_fmt.o
@@ -0,0 +1 @@
+liblimb/esc.h/escall_fmt.o
\ No newline at end of file