~andyc/oils

e80f6304710a4f190a50699e4f861fdccad96dcb — Andy C 2 months ago 97565d0 master
[demo] URLSearchParams function in JavaScript

YSH has a good core!  I was able to do it in pure YSH, and Eggex has a
good structure.

But it needs polish!  See comments at the top of the file.
1 files changed, 271 insertions(+), 0 deletions(-)

A demo/url-search-params.ysh
A demo/url-search-params.ysh => demo/url-search-params.ysh +271 -0
@@ 0,0 1,271 @@
#!bin/ysh
#
# Usage:
#   demo/url-search-params.ysh <function name>
#
# Tested against JavaScript's URLSearchParams.  Differences:
#
# - JS strings can't represent bytes, so %ff turns into the Unicode replacement char.
#   - YSH turns this into the 0xff byte, denoted as b'\yff'
# - JS accepts '==' as key="" value="="
#   - In YSH, this is a syntax error.
# - On the other hand, both JS and YSH agree that =&=&= is 3 empty key value pairs:
#   [["", ""]
#    ["", ""],
#    ["", ""]]
#
# Evaluation of "the YSH experience":
#
# GOOD:
#
# - Eggex is elegant
#   - This code is structured better than the Python stdlib urlparse.py!
#   - This problem is also hard/ugly in JavaScript.  They use an extra
#     s=>replace() on top of decodeURIComponent()!
# - Task files in YSH basically work!
#   - I think this file has a nice structure
# - It's nice to mix INTERIOR YSH testing and EXTERIOR comparison to node.js 
# - Triple quoted multiline strings are nice!
#
# NEEDS WORK:
#
# - need Vim syntax highlighting!
#   - e.g. multiline '' strings aren't higlighted
# - need pp [x] for debugging
# - need assert [x] for testing
# - task files need completion
#
# - Eggex can use multiline /// syntax
# - Eggex could use "which" match
# - m=>group('lit') sorta bothers me, it should be 
#   - m.group('lit')
#   - $lit - probably!
#   - with vars(m.groupDict()) { ... }
# - Alternative to printf -v probably needed, or at least wrap it in the YSH
#   stdlib
#
# - ERROR messages for URL parsing should bubble up to the user!
#   - USER code should be able to point out to location info for bad escapes
#   like %f or %0z
#   - I guess we just need an idiom for this?  A "class"?

source $LIB_OSH/task-five.sh
#source $LIB_YSH/yblocks.ysh

proc _check (; val) {  # TODO: assert
  if (not val) {
    pp line (val)
    error "Failed: $val"
  }
}

func strFromTwoHex(two_hex) {
  var result
  # TODO: provide alternative to old OSH style!

  # Python style would include something like this
  # var i = int(two_hex, 16)

  printf -v result "\\x$two_hex"
  return (result)
}

const Hex = / [0-9 a-f A-F] /

const Quoted = / <capture !['%+']+ as lit> | <capture '+' as plus> | '%' <capture Hex Hex as two_hex> /

func unquote (s) {
  ### Turn strings with %20 into space, etc.

  #echo
  #echo "unquote $s"

  var pos = 0
  var parts = []
  while (true) {
    var m = s => leftMatch(Quoted, pos=pos)
    if (not m) {
      break
    }

    var lit = m => group('lit')
    var plus = m => group('plus')
    var two_hex = m => group('two_hex')

    var part
    if (lit) {
      #echo "  lit $lit"
      setvar part = lit
    } elif (plus) {
      #echo "  plus $plus"
      setvar part = ' '
    } elif (two_hex) {
      #echo "  two_hex $two_hex"
      #setvar part = two_hex

      setvar part = strFromTwoHex(two_hex)
    }
    call parts->append(part)

    setvar pos = m => end(0)
    #echo
  }
  if (pos !== len(s)) {
    error "Unexpected trailing input in unquote"
  }

  return (join(parts))
}

proc js-decode-part(s) {
  nodejs -e '''

  var encoded = process.argv[1];
  
  // It does not handle +, because is only for query params, not components?
  // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent
  var encoded = encoded.replace(/\+/g, " ")
  
  var j = JSON.stringify(decodeURIComponent(encoded))
  process.stdout.write(j);
  
  ''' $s
}

const PART_CASES = [
  'foo+bar',
  'foo%23%40',
  # empty key, empty value, invalid % , etc.
]

proc test-part() {
  echo hi

  #_check ('foo bar' === unquote('foo+bar'))

  for s in (PART_CASES) {
    js-decode-part $s | json read
    echo 'JS'
    pp line (_reply)

    echo 'YSH'
    = unquote(s)
    echo
    #break
  }
}

#
# Query
#

# JavaScript allows either side of k=v to be empty, so we match that
const Tok = / !['&= ']* /

const Pair = / <capture Tok as key> '=' <capture Tok as value> /

const Pairs = / Pair <capture '&' as sep>? /

func URLSearchParams(s) {
  ### Turn k=v&foo=spam+eggs&k=v into a list of pairs

  # Loop over matches
  var pos = 0
  #echo Pairs=$Pairs

  var pairs = []
  while (true) {
    var m = s => leftMatch(Pairs, pos=pos)
    if (not m) {
      break
    }
    #pp line (m)
    #pp line (m => group(0))
    var k = m => group('key')
    var v = m => group('value')

    #pp line (k)
    #pp line (v)

    call pairs->append([unquote(k), unquote(v)])

    setvar pos = m => end(0)
    #pp line (pos)

    var sep = m => group('sep')
    if (not sep) {
      break
    }
  }
  if (pos !== len(s)) {
    error "Unexpected trailing input in URLSearchParams $pos != $[len(s)]"
  }

  return (pairs)
}

proc js-decode-query(s) {
  nodejs -e '''

  const u = new URLSearchParams(process.argv[1]);
  //console.log(JSON.stringify(u));
  
  var pairs = []
  for (pair of u) {
    pairs.push(pair)
  }
  
  var j = JSON.stringify(pairs);
  
  //console.log(j):
  process.stdout.write(j);
  ''' $s
}

const QUERY_CASES = [
  'k=foo+bar',
  'key=foo%23%40',
  'k=v&foo%23=bar+baz+%24%25&k=v',
  'foo+bar=z',

  # JavaScript converts %ff to the Unicode replacement char - its strings can't represent bytes
  'foo%ffbar=z',

  'missing_val=&k=',

  '=missing_key&=m2',

  # This is valid
  '=&=',
  '=&=&',

  # JavaScript treats = as literal - that seems wrong
  # YSH treating this as an error seems right
  #'==',
]

proc test-query() {
  #_check ('foo bar' === unquote('foo+bar'))

  for s in (QUERY_CASES) {
    echo 'INPUT'
    echo "  $s"

    js-decode-query $s | json read
    echo 'JS'
    pp line (_reply)

    echo 'YSH'
    var pairs = URLSearchParams(s)
    pp line (pairs)

    echo
    #break
  }
}

proc run-tests() {
  devtools/byo.sh test $0
}

task-five "$@"