Files
jsluice/strings.go
2023-06-13 14:55:28 +01:00

251 lines
5.4 KiB
Go

package jsluice
import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// An item represents a token in a JavaScript string: either a run of
// literal text or the payload of a single escape sequence.
type item struct {
	typ itemType // what kind of token this is
	val string   // raw text of the token as it appeared in the input
}

// itemType identifies the kind of token held by an item.
type itemType int

const (
	// itemString is literal text that is used verbatim.
	itemString itemType = iota
	// itemSingleEscape is a one-character escape such as \n or \";
	// val holds the character that followed the backslash.
	itemSingleEscape
	// itemHexEscape is a \xNN escape; val holds the hex digits.
	itemHexEscape
	// itemOctalEscape is an octal escape such as \075; val holds the
	// octal digits.
	itemOctalEscape
	// itemUnicodeEscape is a \uNNNN escape; val holds the hex digits.
	itemUnicodeEscape
	// itemCodepointEscape is a \u{N...} escape; val holds the hex
	// digits found between the braces.
	itemCodepointEscape
)

// String returns the decoded text that the token stands for.
//
// Literal text is returned unchanged. Single-character escapes are
// mapped to their control character (\0, \b, \f, \n, \r, \t, \v); any
// other escaped character stands for itself. Numeric escapes are parsed
// in base 16 (hex, unicode, codepoint) or base 8 (octal) and returned
// as the UTF-8 encoding of that code point. If the digits cannot be
// parsed, or the value does not fit in 32 bits, val is returned as-is.
func (i item) String() string {
	var base int

	switch i.typ {
	case itemSingleEscape:
		// A switch avoids building a lookup map on every call.
		switch i.val {
		case "0":
			return "\x00"
		case "b":
			return "\b"
		case "f":
			return "\f"
		case "n":
			return "\n"
		case "r":
			return "\r"
		case "t":
			return "\t"
		case "v":
			return "\v"
		}
		// Quotes, backslashes and so on simply stand for themselves.
		return i.val

	case itemHexEscape, itemUnicodeEscape, itemCodepointEscape:
		base = 16

	case itemOctalEscape:
		base = 8

	default:
		return i.val
	}

	// A bit size of 32 makes ParseInt reject anything that would wrap
	// around when converted to a rune, rather than silently decoding
	// to an unrelated character.
	num, err := strconv.ParseInt(i.val, base, 32)
	if err != nil {
		return i.val
	}
	return string(rune(num))
}
// stringLexer carries everything needed to tokenise one input string:
// the input itself, two byte offsets into it, and the tokens produced
// so far.
type stringLexer struct {
	str        string // input being lexed
	start, pos int    // byte offsets: start of the pending token, and the read cursor
	items      []item // tokens emitted so far, in order
	done       bool   // set once a read was attempted at the end of the input
}
// newStringLexer returns a lexer positioned at the very beginning of
// in, with an empty token list.
func newStringLexer(in string) *stringLexer {
	lex := new(stringLexer) // offsets start at 0 and done at false
	lex.str = in
	lex.items = []item{}
	return lex
}
// Next decodes the rune at the current position, advances the
// position by that rune's encoded width in bytes, and returns the
// rune. Advancing by width rather than by one matters because
// JavaScript source routinely contains multi-byte runes.
//
// When there is no input left, Next marks the lexer as done and
// returns -1 without moving the position.
func (s *stringLexer) Next() rune {
	if s.pos < len(s.str) {
		char, width := utf8.DecodeRuneInString(s.str[s.pos:])
		s.pos += width
		return char
	}

	s.done = true
	return -1
}
// Backup steps the position back over the rune that ends at the
// current position. It does nothing once the lexer is done (the read
// that set done did not advance), or when already at the start.
func (s *stringLexer) Backup() {
	if !s.done && s.pos > 0 {
		_, width := utf8.DecodeLastRuneInString(s.str[:s.pos])
		s.pos -= width
	}
}
// Peek reports the rune that Next would return, leaving the position
// where it was. Because it is built on Next, peeking at the end of the
// input returns -1 and marks the lexer as done.
func (s *stringLexer) Peek() rune {
	// The result of Next is captured before the deferred Backup runs.
	defer s.Backup()
	return s.Next()
}
// Emit records the text between the start pointer and the current
// position as a token of type t, then moves the start pointer up to
// the current position ready for the next token. The recorded text may
// be empty.
func (s *stringLexer) Emit(t itemType) {
	tok := item{typ: t, val: s.str[s.start:s.pos]}
	s.items = append(s.items, tok)
	s.start = s.pos
}
// Ignore discards the pending text: it moves the start pointer up to
// the current position without emitting a token, so whatever lies
// between the previously emitted token and the current position will
// not be part of any later token.
func (s *stringLexer) Ignore() {
s.start = s.pos
}
// Accept consumes the next rune if, and only if, it is one of the
// runes in valid. It reports whether a rune was consumed; on false the
// position is left where it was.
func (s *stringLexer) Accept(valid string) bool {
	if !strings.ContainsRune(valid, s.Next()) {
		// Not a match: undo the read.
		s.Backup()
		return false
	}
	return true
}
// AcceptN tries to consume n consecutive runes that are all members of
// valid, and reports whether it managed all n. On failure the runes
// that did match stay consumed; the position is not rewound.
func (s *stringLexer) AcceptN(valid string, n int) bool {
	matched := 0
	// A failed Accept leaves the position untouched, so every later
	// attempt would fail on the same rune; stop at the first miss.
	for matched < n && s.Accept(valid) {
		matched++
	}
	return matched == n
}
// AcceptUntil consumes runes up to, but not including, the first
// occurrence of r. If r never occurs, everything up to the end of the
// input is consumed and the lexer is marked as done.
func (s *stringLexer) AcceptUntil(r rune) {
	for {
		if s.Next() == r || s.done {
			break
		}
	}
	// Un-read r itself; this is a no-op when the input ran out.
	s.Backup()
}
// AcceptRun consumes the longest possible run of runes that are all
// members of valid, stopping in front of the first rune that is not.
func (s *stringLexer) AcceptRun(valid string) {
	for {
		if !strings.ContainsRune(valid, s.Next()) {
			break
		}
	}
	// Un-read the rune that ended the run.
	s.Backup()
}
// String concatenates the decoded form of every token emitted so far,
// giving the unescaped version of whatever part of the input has been
// lexed. It is normally called once the whole input has been
// processed.
func (s *stringLexer) String() string {
	var decoded strings.Builder
	for idx := range s.items {
		decoded.WriteString(s.items[idx].String())
	}
	return decoded.String()
}
// DecodeString accepts a raw string as it might be found in some
// JavaScript source code, and converts any escape sequences. E.g:
//
//	foo\x3dbar     -> foo=bar  // Hex escapes
//	foo\u003Dbar   -> foo=bar  // Unicode escapes
//	foo\u{003D}bar -> foo=bar  // Braced unicode escapes
//	foo\075bar     -> foo=bar  // Octal escape
//	foo\"bar       -> foo"bar  // Single character escapes
//
// The input is passed through dequote before being lexed.
//
// Decoding is deliberately lenient. A backslash followed by a
// character that does not start a recognised escape is dropped and the
// character is kept as literal text. When a \x or \u escape does not
// have the required number of hex digits, the characters that follow
// it are likewise kept as literal text.
func DecodeString(in string) string {
	const (
		validHex   = "0123456789abcdefABCDEF"
		validOctal = "01234567"
	)

	in = dequote(in)
	l := newStringLexer(in)

	for !l.done {
		// Everything up to the next backslash is literal text
		l.AcceptUntil('\\')
		l.Emit(itemString)
		if l.done {
			break
		}

		// Ignore the backslash
		l.Next()
		l.Ignore()

		switch r := l.Next(); r {
		case 'b', 'f', 'n', 'r', 't', 'v', '\'', '"', '\\':
			l.Emit(itemSingleEscape)

		case '0', '1', '2', '3', '4', '5', '6', '7':
			// It's a \0 (null). Emitting it as a one-digit octal
			// escape makes it decode to U+0000.
			if r == '0' && !unicode.IsDigit(l.Peek()) {
				l.Emit(itemOctalEscape)
				continue
			}

			// It's an octal escape. These are at most three digits
			// long with a maximum value of 377, so a leading 0-3 may
			// be followed by two more octal digits and a leading 4-7
			// by only one. Anything after that is ordinary text.
			if l.Accept(validOctal) && r <= '3' {
				l.Accept(validOctal)
			}
			l.Emit(itemOctalEscape)

		case 'x':
			// ignore the x
			l.Ignore()

			// Exactly 2 hex digits
			if l.AcceptN(validHex, 2) {
				l.Emit(itemHexEscape)
			}

		case 'u':
			// ignore the u
			l.Ignore()

			// e.g. \u{00003d}
			if l.Accept("{") {
				l.Ignore()
				l.AcceptRun(validHex)
				l.Emit(itemCodepointEscape)
				if l.Accept("}") {
					l.Ignore()
				}
				// The escape is complete. Without this, hex digits
				// that happen to follow the closing brace would be
				// swallowed as a second, bogus \uXXXX escape.
				continue
			}

			// e.g. \u003d
			if l.AcceptN(validHex, 4) {
				l.Emit(itemUnicodeEscape)
			}
		}
	}

	// A malformed escape that runs into the end of the input leaves
	// consumed but unemitted text behind; keep it as literal text
	// instead of dropping it.
	if l.start < l.pos {
		l.Emit(itemString)
	}

	return l.String()
}