UTF-8 encoding in pure VCL

Just for fun, here’s UTF-8 encoding in pure VCL.

I wrote this as part of our language tests when we introduced bitwise operators a while ago, but I think it’s also a good example for passing arguments and returning values from user-defined subs.

# in:  var.i - zero-based index into the digit table (0..15 for a nibble)
# out: the one-character lowercase base-16 digit at that index
sub hexdigit(INTEGER var.i) STRING {
  declare local var.digit STRING;

  # the digit table is indexed directly by the nibble value
  set var.digit = substr("0123456789abcdef", var.i, 1);
  return var.digit;
}

# Format one octet as a percent-escape.
# in:  var.i - an octet (only the low eight bits are used)
# out: a literal "%" followed by the octet's two base-16 digits,
#      high nibble first
sub hex(INTEGER var.i) STRING {
  declare local var.high_nibble INTEGER;
  declare local var.low_nibble INTEGER;
  declare local var.high_digit STRING;
  declare local var.low_digit STRING;
  declare local var.escaped STRING;

  # bits 4..7 -> first digit
  set var.high_nibble = var.i;
  set var.high_nibble >>= 4;
  set var.high_nibble &= 0xf;
  set var.high_digit = hexdigit(var.high_nibble);

  # bits 0..3 -> second digit
  set var.low_nibble = var.i;
  set var.low_nibble &= 0xf;
  set var.low_digit = hexdigit(var.low_nibble);

  # {"%"} is a long-string literal, so the % is not read as a hex escape
  set var.escaped = {"%"} + var.high_digit + var.low_digit;
  return var.escaped;
}

# Encode a Unicode codepoint as a UTF-8 byte sequence.
#
# in:  var.codepoint - the codepoint as a string; parsed as base 16 when it
#                      starts with "0x", otherwise as a decimal integer
# out: the UTF-8 byte sequence for the codepoint, or "" when the value is
#      not encodable: a surrogate (0xd800..0xdfff), negative, or above
#      0x10ffff
sub codepoint(STRING var.codepoint) STRING {
  declare local var.cp INTEGER;

  # percent-escaped octets of the sequence, lead byte in var.c0;
  # only as many as the sequence needs are assigned
  declare local var.c0 STRING;
  declare local var.c1 STRING;
  declare local var.c2 STRING;
  declare local var.c3 STRING;

  declare local var.tmp INTEGER;

  if (var.codepoint ~ "^0x") {
    set var.cp = std.strtol(var.codepoint, 16);
  } else {
    set var.cp = std.atoi(var.codepoint);
  }

  # outside the Unicode codespace: nothing to encode
  if (var.cp < 0 || var.cp > 0x10ffff) {
    return "";
  }

  # surrogates are invalid in UTF-8; the range is inclusive at both
  # ends, so 0xd800 itself must be rejected too
  if (var.cp >= 0xd800 && var.cp <= 0xdfff) {
    return "";
  }

  if (var.cp <= 0x7f) {
    # one byte: 0xxxxxxx
    set var.c0 = hex(var.cp);
  } else if (var.cp <= 0x7ff) {
    # two bytes: 110xxxxx 10xxxxxx
    set var.tmp = var.cp;
    set var.tmp >>= 6;
    set var.tmp += 192;
    set var.c0 = hex(var.tmp);

    set var.tmp = var.cp;
    set var.tmp &= 63;
    set var.tmp += 128;
    set var.c1 = hex(var.tmp);
  } else if (var.cp <= 0xffff) {
    # three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    set var.tmp = var.cp;
    set var.tmp >>= 12;
    set var.tmp += 224;
    set var.c0 = hex(var.tmp);

    set var.tmp = var.cp;
    set var.tmp >>= 6;
    set var.tmp &= 63;
    set var.tmp += 128;
    set var.c1 = hex(var.tmp);

    set var.tmp = var.cp;
    set var.tmp &= 63;
    set var.tmp += 128;
    set var.c2 = hex(var.tmp);
  } else {
    # four bytes (0x10000..0x10ffff): 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    set var.tmp = var.cp;
    set var.tmp >>= 18;
    set var.tmp += 240;
    set var.c0 = hex(var.tmp);

    set var.tmp = var.cp;
    set var.tmp >>= 12;
    set var.tmp &= 63;
    set var.tmp += 128;
    set var.c1 = hex(var.tmp);

    set var.tmp = var.cp;
    set var.tmp >>= 6;
    set var.tmp &= 63;
    set var.tmp += 128;
    set var.c2 = hex(var.tmp);

    set var.tmp = var.cp;
    set var.tmp &= 63;
    set var.tmp += 128;
    set var.c3 = hex(var.tmp);
  }

  # borrowing urldecode() for hex decoding
  return urldecode(var.c0 + var.c1 + var.c2 + var.c3);
}

sub vcl_recv {
  # Raise error status 600 for every request; the response for it is
  # assembled in vcl_error.
  error 600;
}

sub vcl_error {
  declare local var.emoji INTEGER;
  declare local var.body STRING;

  # pick a random codepoint from the unicode emoji range
  set var.emoji = randomint(0x1F600, 0x1F64F);

  # turn it into its utf8 byte sequence
  set var.body = codepoint(var.emoji);

  # serve the encoded character as a synthetic plain-text response
  set obj.http.content-type = "text/plain; charset=utf-8";
  synthetic var.body;
}
2 Likes

What do the curly braces in set var.r = {"%"} + var.c1 + var.c0; do?

In regular ""-quoted string literals, %ab is a hex escape sequence, because it’s supposed to look like URL encoding rather than C-style \xab escapes.

{xyz"abc%abcabc"xyz} is a heredoc-style “long string” literal, which doesn’t do %ab hex escaping inside the string.

{"abc%abc"} is a heredoc-style string literal without the heredoc identifier.

So {"%"} is a literal %, which I could also have written as "%25".

The syntax for string literals is documented here: STRING | Fastly Documentation