// Listing metadata: 251 lines, 5.4 KiB, Go.

package jsluice
|
|
|
|
import (
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// An item represents a token in a JavaScript string: either a run of
// literal text or the payload of a single escape sequence.
type item struct {
	typ itemType // the kind of token
	val string   // literal text, or the escape's payload (e.g. "n", "3d", "075")
}

// itemType identifies the kind of token held by an item.
type itemType int

const (
	itemString          itemType = iota // literal text, returned unchanged
	itemSingleEscape                    // single-character escape; val is the escaped character
	itemHexEscape                       // val is parsed as a base-16 number
	itemOctalEscape                     // val is parsed as a base-8 number
	itemUnicodeEscape                   // val is parsed as a base-16 number
	itemCodepointEscape                 // val is parsed as a base-16 number
)

// String returns the text the token stands for.
//
// Literal tokens are returned unchanged. Single-character escapes are
// mapped to their control character (including "0", which is NUL);
// any other escaped character stands for itself. Numeric escapes are
// converted to the UTF-8 encoding of the code point they name.
func (i item) String() string {
	switch i.typ {
	case itemSingleEscape:
		// A switch avoids rebuilding a lookup map on every call.
		switch i.val {
		case "0":
			return "\x00"
		case "b":
			return "\b"
		case "f":
			return "\f"
		case "n":
			return "\n"
		case "r":
			return "\r"
		case "t":
			return "\t"
		case "v":
			return "\v"
		}
		// Quotes, backslashes and the like stand for themselves.
		return i.val
	case itemHexEscape, itemUnicodeEscape, itemCodepointEscape:
		return i.codePoint(16)
	case itemOctalEscape:
		return i.codePoint(8)
	default:
		// itemString, and any type we do not know about, is returned verbatim.
		return i.val
	}
}

// codePoint parses val as a number in the given base and returns the
// UTF-8 encoding of that code point. If val is not a valid number it is
// returned unchanged. A value outside the Unicode range yields U+FFFD
// rather than being silently truncated to some unrelated character.
func (i item) codePoint(base int) string {
	num, err := strconv.ParseInt(i.val, base, 64)
	if err != nil {
		return i.val
	}
	if num < 0 || num > unicode.MaxRune {
		return string(utf8.RuneError)
	}
	return string(rune(num))
}
|
|
|
|
// a stringLexer maintains the state needed to lex a string
|
|
type stringLexer struct {
|
|
str string // The input string
|
|
start int // The start position (in bytes) of the current token being lexed
|
|
pos int // The current position (in bytes) of the rune being looked at
|
|
items []item // A slice of tokens that have been emitted
|
|
done bool // Flag that's set when we've consumed all of the input
|
|
}
|
|
|
|
func newStringLexer(in string) *stringLexer {
|
|
return &stringLexer{
|
|
str: in,
|
|
start: 0,
|
|
pos: 0,
|
|
items: make([]item, 0),
|
|
done: false,
|
|
}
|
|
}
|
|
|
|
// Next returns the next rune in the input string, moving the
|
|
// position pointer on by the size of the rune that was decoded.
|
|
// For ASCII text this wouldn't be required, but in JavaScript
|
|
// source we will encounter many runes that have a length of more
|
|
// than one byte
|
|
func (s *stringLexer) Next() rune {
|
|
if s.pos >= len(s.str) {
|
|
s.done = true
|
|
return -1
|
|
}
|
|
|
|
r, l := utf8.DecodeRuneInString(s.str[s.pos:])
|
|
s.pos += l
|
|
return r
|
|
}
|
|
|
|
// Backup moves the position pointer back by the length of the
|
|
// previous rune in the input string.
|
|
func (s *stringLexer) Backup() {
|
|
if s.done || s.pos <= 0 {
|
|
return
|
|
}
|
|
_, l := utf8.DecodeLastRuneInString(s.str[:s.pos])
|
|
s.pos -= l
|
|
}
|
|
|
|
// Peek returns the next rune in the input string without advancing
|
|
// the position pointer
|
|
func (s *stringLexer) Peek() rune {
|
|
r := s.Next()
|
|
s.Backup()
|
|
return r
|
|
}
|
|
|
|
// Emit adds a token of the provided type to the stringLexer's
|
|
// internal list of tokens. The start pointer is advanced to the
|
|
// current position.
|
|
func (s *stringLexer) Emit(t itemType) {
|
|
s.items = append(s.items, item{
|
|
typ: t,
|
|
val: s.str[s.start:s.pos],
|
|
})
|
|
s.start = s.pos
|
|
}
|
|
|
|
// Ignore moves the start position pointer to the current
|
|
// position without emitting a token; effectively ignoring
|
|
// the chunk of text between the last token we emitted and now.
|
|
func (s *stringLexer) Ignore() {
|
|
s.start = s.pos
|
|
}
|
|
|
|
// Accept advances the position pointer only if the next rune
|
|
// is in the set of valid runes provided
|
|
func (s *stringLexer) Accept(valid string) bool {
|
|
if strings.ContainsRune(valid, s.Next()) {
|
|
return true
|
|
}
|
|
s.Backup()
|
|
return false
|
|
}
|
|
|
|
// AcceptN advances the position pointer N times, only if
|
|
// the next N runes are in the set of valid runes provided
|
|
func (s *stringLexer) AcceptN(valid string, n int) bool {
|
|
count := 0
|
|
for i := 0; i < n; i++ {
|
|
if s.Accept(valid) {
|
|
count++
|
|
}
|
|
}
|
|
return count == n
|
|
}
|
|
|
|
// AcceptUntil accepts any runes until the rune provided is
|
|
// encountered
|
|
func (s *stringLexer) AcceptUntil(r rune) {
|
|
for s.Next() != r && !s.done {
|
|
}
|
|
s.Backup()
|
|
}
|
|
|
|
// AcceptRun accepts runes until encountering a rune not
|
|
// in the set of valid runes provided
|
|
func (s *stringLexer) AcceptRun(valid string) {
|
|
for strings.ContainsRune(valid, s.Next()) {
|
|
}
|
|
s.Backup()
|
|
}
|
|
|
|
// String returns the unescaped representation of the input
|
|
// that has been lexed so far. Usually it would only be
|
|
// called after all the input has been processed.
|
|
func (s *stringLexer) String() string {
|
|
out := &strings.Builder{}
|
|
for _, i := range s.items {
|
|
out.WriteString(i.String())
|
|
}
|
|
return out.String()
|
|
}
|
|
|
|
// DecodeString accepts a raw string as it might be found in some
|
|
// JavaScript source code, and converts any escape sequences. E.g:
|
|
// foo\x3dbar -> foo=bar // Hex escapes
|
|
// foo\u003Dbar -> foo=bar // Unicode escapes
|
|
// foo\u{003D}bar -> foo=bar // Braced unicode escapes
|
|
// foo\075bar -> foo=bar // Octal escape
|
|
// foo\"bar -> foo"bar // Single character escapes
|
|
func DecodeString(in string) string {
|
|
in = dequote(in)
|
|
l := newStringLexer(in)
|
|
|
|
validHex := "0123456789abcdefABCDEF"
|
|
|
|
for !l.done {
|
|
l.AcceptUntil('\\')
|
|
l.Emit(itemString)
|
|
|
|
if l.done {
|
|
break
|
|
}
|
|
|
|
// Ignore the backslash
|
|
l.Next()
|
|
l.Ignore()
|
|
|
|
switch l.Next() {
|
|
case 'b', 'f', 'n', 'r', 't', 'v', '\'', '"', '\\':
|
|
l.Emit(itemSingleEscape)
|
|
case '0':
|
|
// It's a \0 (null)
|
|
if !unicode.IsDigit(l.Peek()) {
|
|
l.Emit(itemSingleEscape)
|
|
continue
|
|
}
|
|
// It's an octal escape
|
|
l.AcceptRun("01234567")
|
|
l.Emit(itemOctalEscape)
|
|
case 'x':
|
|
// ignore the x
|
|
l.Ignore()
|
|
|
|
// Exactly 2 hex digits
|
|
if l.AcceptN(validHex, 2) {
|
|
l.Emit(itemHexEscape)
|
|
}
|
|
case 'u':
|
|
// ignore the u
|
|
l.Ignore()
|
|
|
|
// e.g. \u{00003d}
|
|
if l.Accept("{") {
|
|
l.Ignore()
|
|
l.AcceptRun(validHex)
|
|
l.Emit(itemCodepointEscape)
|
|
if l.Accept("}") {
|
|
l.Ignore()
|
|
}
|
|
}
|
|
|
|
// e.g. \u003d
|
|
if l.AcceptN(validHex, 4) {
|
|
l.Emit(itemUnicodeEscape)
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return l.String()
|
|
}
|