Import of jsurls

This commit is contained in:
Tom Hudson
2022-08-30 12:38:49 +01:00
commit 605d0760ea
25 changed files with 1943 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*.sw*
cpu.pprof

8
README.mkd Normal file
View File

@@ -0,0 +1,8 @@
# jsluice
A Go package and tool for extracting URLs, secrets, and other interesting data from JavaScript files.
Uses [go-tree-sitter](https://github.com/smacker/go-tree-sitter) for parsing.

31
analyzer.go Normal file
View File

@@ -0,0 +1,31 @@
package jsurls
import (
sitter "github.com/smacker/go-tree-sitter"
"github.com/smacker/go-tree-sitter/javascript"
)
// Analyzer could be considered the core type of jsluice. It wraps
// the parse tree for a JavaScript file and provides mechanisms to
// extract URLs, secrets etc
type Analyzer struct {
	source      []byte         // raw JavaScript source that was parsed
	parser      *sitter.Parser // parser retained for the lifetime of the Analyzer
	urlMatchers []URLMatcher   // matchers applied when extracting URLs
	rootNode    *sitter.Node   // root of the parse tree for source
}
// NewAnalyzer parses the provided JavaScript source and returns an
// Analyzer ready to extract URLs and other data from it.
func NewAnalyzer(source []byte) *Analyzer {
	p := sitter.NewParser()
	p.SetLanguage(javascript.GetLanguage())

	return &Analyzer{
		source:      source,
		parser:      p,
		urlMatchers: AllURLMatchers(),
		rootNode:    p.Parse(nil, source).RootNode(),
	}
}

21
analyzer_test.go Normal file
View File

@@ -0,0 +1,21 @@
package jsurls
import "testing"
// TestAnalyzerBasic covers the end-to-end happy path: a single
// document.location assignment should produce exactly one URL match.
func TestAnalyzerBasic(t *testing.T) {
	a := NewAnalyzer([]byte(`
		function foo(){
			document.location = "/logout"
		}
	`))

	urls := a.GetURLs()

	if len(urls) != 1 {
		t.Errorf("Expected exactly 1 URL; got %d", len(urls))
	}

	if urls[0].URL != "/logout" {
		t.Errorf("Expected first URL to be '/logout'; got %s", urls[0].URL)
	}
}

1
cmd/jsurls-sinks/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
jsurls-sinks

View File

@@ -0,0 +1,6 @@
# jsurls-sinks
Development tool for finding places in JavaScript files that use URLs.
The idea is you can run this against a whole bunch of JS files and it will spit out places that
might be good to add to the main `jsurls` matchers.

194
cmd/jsurls-sinks/main.go Normal file
View File

@@ -0,0 +1,194 @@
package main
import (
"flag"
"fmt"
"io/ioutil"
"log"
"regexp"
"strings"
sitter "github.com/smacker/go-tree-sitter"
"github.com/smacker/go-tree-sitter/javascript"
)
// main reads the JavaScript file named by the first CLI argument and
// prints candidate "sinks": assignment targets and function calls whose
// string arguments look like URL paths. The output is raw material for
// writing new jsurls matchers.
func main() {
	reWhitespace := regexp.MustCompile(`\s{2,}`)
	reJSName := regexp.MustCompile(`^[a-zA-Z0-9_$.-]+$`)

	flag.Parse()

	source, err := ioutil.ReadFile(flag.Arg(0))
	if err != nil {
		log.Fatal(err)
	}

	parser := sitter.NewParser()
	parser.SetLanguage(javascript.GetLanguage())

	tree := parser.Parse(nil, source)
	root := tree.RootNode()

	// enter is invoked for every assignment_expression and
	// call_expression captured by queryNodes below.
	enter := func(n *sitter.Node) {
		switch n.Type() {
		case "assignment_expression":
			left := n.ChildByFieldName("left")
			right := n.ChildByFieldName("right")
			if left == nil || right == nil {
				return
			}

			rightContent := right.Content(source)
			if !startsWithString(rightContent) {
				return
			}
			// NOTE(review): this collapsed-whitespace value is never read
			// afterwards — dequote below re-fetches the raw content, so
			// this assignment appears to be dead code; confirm intent.
			rightContent = reWhitespace.ReplaceAllString(rightContent, " ")

			rightStr := dequote(right.Content(source))

			if couldBePath(rightStr) {
				fmt.Printf("%s (assignment)\n", left.Content(source))
			}
		case "call_expression":
			callName := n.ChildByFieldName("function").Content(source)

			// It's common to find things like immediately called anonymous functions
			// in JS source, and we don't care about those because we could never match
			// on them
			if !reJSName.MatchString(callName) {
				return
			}

			arguments := n.ChildByFieldName("arguments")
			if arguments == nil {
				return
			}

			// we want to iterate over the arguments and find
			// any that look like a url
			c := sitter.NewTreeCursor(arguments)
			defer c.Close()

			// no args
			if !c.GoToFirstChild() {
				return
			}

			foundPath := false
			position := 0
			for {
				arg := c.CurrentNode()
				if arg == nil {
					break
				}

				// named args only (i.e. don't count commas etc)
				if arg.IsNamed() {
					argContent := arg.Content(source)
					if startsWithString(argContent) && couldBePath(dequote(argContent)) {
						foundPath = true
						break
					}
					position++
				}

				if !c.GoToNextSibling() {
					break
				}
			}

			if foundPath {
				fmt.Printf("%s (arg %d)\n", callName, position)
			}
		}
	}

	queryNodes(root, enter)
}
// startsWithString reports whether in opens with a JavaScript quote
// character (single, double, or backtick) and is long enough to be a
// quoted string.
func startsWithString(in string) bool {
	if len(in) < 2 {
		return false
	}
	switch in[0] {
	case '"', '\'', '`':
		return true
	}
	return false
}
// couldBePath reports whether in looks like a URL or path: an http(s)
// URL, an absolute path, or an explicit relative path. Very short
// candidates are rejected to keep false positives down.
func couldBePath(in string) bool {
	switch {
	case strings.HasPrefix(in, "http:"):
		return len(in) > 7
	case strings.HasPrefix(in, "https:"):
		return len(in) > 8
	case strings.HasPrefix(in, "/"):
		return len(in) > 3
	case strings.HasPrefix(in, "./"):
		return len(in) > 4
	}
	return false
}
// queryNodes runs a fixed tree-sitter query matching every assignment
// and call expression under n, invoking enter for each captured node.
func queryNodes(n *sitter.Node, enter func(*sitter.Node)) {
	query, err := sitter.NewQuery(
		[]byte("[(assignment_expression) (call_expression)] @matches"),
		javascript.GetLanguage(),
	)
	if err != nil {
		// the query text is a constant, so this only fires if the
		// pattern itself is broken
		log.Fatal(err)
	}

	qc := sitter.NewQueryCursor()
	defer qc.Close()
	qc.Exec(query, n)

	for {
		match, exists := qc.NextMatch()
		if !exists || match == nil {
			break
		}

		for _, capture := range match.Captures {
			enter(capture.Node)
		}
	}
}
// walk performs a manual depth-first traversal of the tree rooted at
// n, calling enter for each node visited. Note that enter is never
// called for n itself — the first visit happens after descending to
// its first child.
func walk(n *sitter.Node, enter func(*sitter.Node)) {
	c := sitter.NewTreeCursor(n)
	defer c.Close()

	// walkies
	recurse := true
	for {
		// descend into the tree
		if recurse && c.GoToFirstChild() {
			recurse = true
			enter(c.CurrentNode())
			continue
		}

		// move sideways
		if c.GoToNextSibling() {
			recurse = true
			enter(c.CurrentNode())
			continue
		}

		// climb back up the tree, but make sure we don't descend right back to where we were
		if c.GoToParent() {
			// recurse=false stops the next iteration re-taking the
			// GoToFirstChild branch we just climbed out of
			recurse = false
			continue
		}

		break
	}
}
// dequote strips any leading and trailing JavaScript quote characters
// (single, double, or backtick) from in.
func dequote(in string) string {
	const quoteChars = "'\"`"
	return strings.Trim(in, quoteChars)
}

1
cmd/jsurls/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
jsurls

77
cmd/jsurls/README.mkd Normal file
View File

@@ -0,0 +1,77 @@
# jsurls
Extract URLs and their parameters from JavaScript files. Uses [go-tree-sitter](https://github.com/smacker/go-tree-sitter) for parsing.
## Install
Run `go install` in this directory, or `go install github.com/bishopfoxmss/jsurls/cmd/jsurls@latest`
If you want the `go install github.com/bish...` command to work, you'll need to make sure you have configured git and Go
to behave properly. In your `~/.gitconfig` you need this to make sure clones are done with SSH:
```
[url "git@github.com:"]
insteadOf = https://github.com/
```
You also need to tell Go not to use their public proxy for packages under `github.com/bishopfoxmss/`.
Run this, and/or put this in your `~/.bashrc` or equivalent:
```
go env -w GOPRIVATE='github.com/bishopfoxmss/*'
```
## Usage
Regular usage is with `jsurls <filename>`. The output is a JSON stream, so you'll probably
want to pipe to `jq` for formatting in most cases:
```
▶ jsurls testdata/jquery-post.js | jq
{
"url": "demo_test_post.asp",
"params": [
"name",
"city"
],
"method": "POST",
"type": "$.post"
}
{
"url": "/logout.php",
"params": [
"redirect"
],
"method": "GET",
"type": "$.get"
}
```
You can print the tree for a JS file with the `--tree`/`-t` flag. This is a useful reference when writing matchers:
```
▶ jsurls testdata/hello.js --tree
program
expression_statement
call_expression
function: member_expression
object: identifier (console)
property: property_identifier (log)
arguments: arguments
string ("Hello, world!")
```
You can see the source code for each match with the `--include-source`/`-i` flag:
```
▶ jsurls testdata/jquery-post.js --include-source | head -n1 | jq .source -r
$.post("demo_test_post.asp",
{
name: "Donald Duck",
city: "Duckburg"
},
function(data, status){
alert("Data: " + data + "\nStatus: " + status);
document.location = data.nextURL
})
```

148
cmd/jsurls/main.go Normal file
View File

@@ -0,0 +1,148 @@
package main
// Extract URLs and related stuff out of JavaScript files
import (
"bufio"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/url"
"os"
"strings"
"sync"
"github.com/bishopfoxmss/jsurls"
"github.com/pkg/profile"
flag "github.com/spf13/pflag"
)
// main reads JavaScript filenames (from CLI args, or stdin when no
// args are given), extracts URL matches from each file concurrently,
// and prints the matches as a stream of JSON objects on stdout.
func main() {
	var treeMode bool
	flag.BoolVarP(&treeMode, "tree", "t", false, "Just print the tree for the provided file")

	var includeSource bool
	flag.BoolVarP(&includeSource, "include-source", "i", false, "Include the source code where the URL was found")

	var ignoreStrings bool
	flag.BoolVar(&ignoreStrings, "ignore-strings", false, "Ignore matches from string literals")

	var includeFilename bool
	flag.BoolVar(&includeFilename, "include-filename", false, "Include the filename of the matched file in the output")

	var profileMode bool
	flag.BoolVar(&profileMode, "profile", false, "Profile cpu usage and save a cpu.pprof file in the current dir")

	var concurrency int
	flag.IntVarP(&concurrency, "concurrency", "c", 1, "Number of files to process concurrently")

	var resolve string
	flag.StringVarP(&resolve, "resolve", "r", "", "Resolve relative paths using the absolute URL provided")

	flag.Parse()

	if profileMode {
		defer profile.Start(profile.ProfilePath(".")).Stop()
	}

	// pre-parse the base URL used to resolve relative matches
	var resolveURL *url.URL
	var err error
	if resolve != "" {
		resolveURL, err = url.Parse(resolve)
		if err != nil {
			fmt.Fprintf(os.Stderr, "failed to parse resolve URL: %s\n", err)
			return
		}
	}

	// filenames arrive one per line: from stdin by default, or
	// synthesized from the CLI arguments when any are given
	var input io.Reader = os.Stdin
	if flag.Arg(0) != "" {
		input = strings.NewReader(
			strings.Join(flag.Args(), "\n"),
		)
	}

	wg := sync.WaitGroup{}
	jobs := make(chan string)
	matches := make(chan *jsurls.Match)

	// worker pool: each worker owns its own extractor
	for i := 0; i < concurrency; i++ {
		wg.Add(1)
		go func() {
			extractor := jsurls.NewExtractor()

			for filename := range jobs {
				source, err := ioutil.ReadFile(filename)
				if err != nil {
					fmt.Fprintf(os.Stderr, "%s", err)
					continue
				}

				// print just the tree and stop
				if treeMode {
					fmt.Printf("%s:\n", filename)
					jsurls.PrintTree(source)
					continue
				}

				for _, m := range extractor.GetMatches(source) {
					m.Filename = filename
					matches <- m
				}
			}
			wg.Done()
		}()
	}

	// read jobs from the input reader, send on jobs channel, close jobs channel
	go func() {
		sc := bufio.NewScanner(input)
		for sc.Scan() {
			filename := sc.Text()
			jobs <- filename
		}
		close(jobs)

		// once the workers finish there can be no more matches
		wg.Wait()
		close(matches)
	}()

	// read and filter the results
	for m := range matches {
		if ignoreStrings && m.Type == "stringLiteral" {
			continue
		}

		// remove filename if the user doesn't want it
		if !includeFilename {
			m.Filename = ""
		}

		// remove any source if we don't want to display it
		if !includeSource {
			m.Source = ""
		}

		if resolveURL != nil {
			parsed, err := url.Parse(m.URL)
			if err == nil {
				m.URL = resolveURL.ResolveReference(parsed).String()
			}
		}

		j, err := json.Marshal(m)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s\n", err)
			continue
		}

		fmt.Printf("%s\n", j)
	}
}

1
cmd/treehugger/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
treehugger

64
cmd/treehugger/main.go Normal file
View File

@@ -0,0 +1,64 @@
package main
import (
"bufio"
"flag"
"fmt"
"io/ioutil"
"log"
"os"
sitter "github.com/smacker/go-tree-sitter"
"github.com/smacker/go-tree-sitter/javascript"
)
// main reads filenames from stdin (one per line), parses each file as
// JavaScript, and prints the source text of every node matched by the
// tree-sitter query given as the first CLI argument.
func main() {
	flag.Parse()
	queryStr := flag.Arg(0)

	parser := sitter.NewParser()
	parser.SetLanguage(javascript.GetLanguage())

	sc := bufio.NewScanner(os.Stdin)
	for sc.Scan() {
		source, err := ioutil.ReadFile(sc.Text())
		if err != nil {
			fmt.Fprintf(os.Stderr, "error opening file: %s\n", err)
			continue
		}

		// print each matched node's source text
		enter := func(n *sitter.Node) {
			content := n.Content(source)
			fmt.Println(content)
		}

		tree := parser.Parse(nil, source)
		root := tree.RootNode()

		query(root, queryStr, enter)
	}
}
// query compiles queryStr as a tree-sitter query, runs it against the
// tree rooted at n, and calls enter for every captured node. An
// invalid (user-supplied) query is fatal.
func query(n *sitter.Node, queryStr string, enter func(*sitter.Node)) {
	q, err := sitter.NewQuery([]byte(queryStr), javascript.GetLanguage())
	if err != nil {
		log.Fatal(err)
	}

	qc := sitter.NewQueryCursor()
	defer qc.Close()
	qc.Exec(q, n)

	for {
		match, exists := qc.NextMatch()
		if !exists || match == nil {
			break
		}

		for _, capture := range match.Captures {
			enter(capture.Node)
		}
	}
}

10
go.mod Normal file
View File

@@ -0,0 +1,10 @@
module github.com/bishopfoxmss/jsurls
go 1.18
require (
github.com/pkg/profile v1.6.0 // indirect
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/exp v0.0.0-20220706164943-b4a6d9510983 // indirect
)

24
go.sum Normal file
View File

@@ -0,0 +1,24 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pkg/profile v1.6.0 h1:hUDfIISABYI59DyeB3OTay/HxSRwTQ8rB/H83k6r5dM=
github.com/pkg/profile v1.6.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/smacker/go-tree-sitter v0.0.0-20220421092837-ec55f7cfeaf4 h1:UFOHRX5nrxNCVORhicjy31nzSVt9rEjf/YRcx2Dc3MM=
github.com/smacker/go-tree-sitter v0.0.0-20220421092837-ec55f7cfeaf4/go.mod h1:EiUuVMUfLQj8Sul+S8aKWJwQy7FRYnJCO2EWzf8F5hk=
github.com/smacker/go-tree-sitter v0.0.0-20220623130553-1191a8204295 h1:z1lMT/t6SS2A1nYHqcn/61C4uJptpdTyWGnh2P7xBpE=
github.com/smacker/go-tree-sitter v0.0.0-20220623130553-1191a8204295/go.mod h1:q99oHDsbP0xRwmn7Vmob8gbSMNyvJ83OauXPSuHQuKE=
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11 h1:l4ch+twh4vEZ5VDPyiqC/6h8BhGWHiDxdFRN4M/ZAck=
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11/go.mod h1:q99oHDsbP0xRwmn7Vmob8gbSMNyvJ83OauXPSuHQuKE=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
golang.org/x/exp v0.0.0-20220706164943-b4a6d9510983 h1:sUweFwmLOje8KNfXAVqGGAsmgJ/F8jJ6wBLJDt4BTKY=
golang.org/x/exp v0.0.0-20220706164943-b4a6d9510983/go.mod h1:Kr81I6Kryrl9sr8s2FK3vxD90NdsKWRuOIl2O4CvYbA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

74
maybeurl.go Normal file
View File

@@ -0,0 +1,74 @@
package jsurls
import (
"net/url"
"strings"
)
var fileExtensions set
func init() {
fileExtensions = newSet([]string{
"js", "css", "html", "htm", "xhtml", "xlsx",
"xls", "docx", "doc", "pdf", "rss", "xml",
"php", "phtml", "asp", "aspx", "asmx", "ashx",
"cgi", "pl", "rb", "py", "do", "jsp",
"jspa", "json", "jsonp", "txt",
})
}
// MaybeURL applies a series of cheap heuristics to decide whether in
// is worth treating as a URL, avoiding a full analysis of every
// candidate string. It is deliberately conservative: some real URLs
// are rejected in order to keep false positives low.
func MaybeURL(in string) bool {
	// This should eliminate a pretty big percentage of
	// string literals that we find, and avoid spending
	// the resources on parsing them as URLs
	if !strings.ContainsAny(in, "/?") {
		return false
	}

	// We want to be fairly restrictive to cut out things
	// like regex strings, blocks of HTML etc. We will miss
	// a handful of URLs this way, but that's probably
	// better than spitting out a ton of false-positives
	if strings.ContainsAny(in, " ()!<>'\"`{}^$,") {
		return false
	}

	// Let's attempt to parse it as a URL, so we can
	// do some analysis on the individual parts
	u, err := url.Parse(in)
	if err != nil {
		return false
	}

	// Valid scheme? Anything other than http(s) is rejected.
	if u.Scheme != "" {
		s := strings.ToLower(u.Scheme)
		if s != "http" && s != "https" {
			return false
		}
	}

	// Valid-looking hostname (at least two dot-separated labels)?
	if len(strings.Split(u.Hostname(), ".")) > 1 {
		return true
	}

	// Valid query string with at least one value?
	for _, vv := range u.Query() {
		if len(vv) > 0 && len(vv[0]) > 0 {
			return true
		}
	}

	// Known file extensions is the last thing we want to
	// check so if there's no dot then it's a no from us.
	// (strings.Contains, not ContainsAny: the needle is a
	// single literal dot, not a set of characters.)
	if !strings.Contains(u.Path, ".") {
		return false
	}

	parts := strings.Split(u.Path, ".")
	ext := parts[len(parts)-1]

	return fileExtensions.Contains(ext)
}

39
maybeurl_test.go Normal file
View File

@@ -0,0 +1,39 @@
package jsurls
import (
"testing"
)
// TestMaybeURL table-tests the URL heuristics with both positive and
// negative examples, including regex-like and MIME-type lookalikes.
func TestMaybeURL(t *testing.T) {
	cases := []struct {
		in       string
		expected bool
	}{
		{"https://example.com", true},
		{"https://example.net/api/v1", true},
		{"HTTP://example.net/api/v1", true},
		{"application/json", false},
		{"text/plain", false},
		{"//example.org", true},
		{"example.org", false},
		{"foo?id=123", true},
		{"Who? Me?", false},
		{"foo.php?id", true},
		{"foo.lolno?id", false},
		{"/foo/bar.html", true},
		{"./foo/bar.html", true},
		{`~[A-Z](?=[/|([{\u003c\\\"'])`, false},

		// These might look like paths to humans, but we couldn't
		// be confident enough about them programmatically
		{"./", false},
		{"foo/bar", false},
	}

	for _, c := range cases {
		actual := MaybeURL(c.in)
		if actual != c.expected {
			t.Errorf("want %t for MaybeURL(%s); have %t", c.expected, c.in, actual)
		}
	}
}

112
objects.go Normal file
View File

@@ -0,0 +1,112 @@
package jsurls
import (
"strings"
sitter "github.com/smacker/go-tree-sitter"
)
// object wraps a tree-sitter node expected to be a JavaScript object
// literal, providing convenient access to its key/value pairs.
type object struct {
	node   *sitter.Node // possibly nil, possibly not an object node
	source []byte       // source the node's content is read from
}

// newObject wraps n. A nil or non-object n is tolerated: the
// accessors gate on hasValidNode and return zero values instead.
func newObject(n *sitter.Node, source []byte) object {
	return object{
		node:   n,
		source: source,
	}
}
// asMap flattens the object into a map of key to string value.
// Non-string values are represented by the empty string (see
// getString). The result is a non-nil empty map for invalid or
// non-object nodes.
func (o object) asMap() map[string]string {
	if !o.hasValidNode() {
		return make(map[string]string)
	}

	// pre-size from the known key count instead of a zero hint
	keys := o.getKeys()
	out := make(map[string]string, len(keys))
	for _, k := range keys {
		out[k] = o.getString(k, "")
	}
	return out
}
// hasValidNode reports whether the wrapped node is actually a
// JavaScript object literal; every accessor gates on this.
func (o object) hasValidNode() bool {
	return o.node != nil && o.node.Type() == "object"
}

// getNodeFunc returns the value node of the first pair whose
// (dequoted) key satisfies fn, or nil when no pair matches.
func (o object) getNodeFunc(fn func(key string) bool) *sitter.Node {
	if !o.hasValidNode() {
		return nil
	}

	count := int(o.node.NamedChildCount())
	for i := 0; i < count; i++ {
		pair := o.node.NamedChild(i)
		// only key:value pairs count; skip anything else
		if pair.Type() != "pair" {
			continue
		}

		if !fn(dequote(content(pair.ChildByFieldName("key"), o.source))) {
			continue
		}

		return pair.ChildByFieldName("value")
	}
	return nil
}

// getNode returns the value node for an exact key match, or nil.
func (o object) getNode(key string) *sitter.Node {
	return o.getNodeFunc(func(candidate string) bool {
		return key == candidate
	})
}

// getNodeI is the case-insensitive (for keys) version of getNode.
func (o object) getNodeI(key string) *sitter.Node {
	key = strings.ToLower(key)
	return o.getNodeFunc(func(candidate string) bool {
		return key == strings.ToLower(candidate)
	})
}
// getKeys returns the (dequoted) key of every pair in the object, in
// source order. The result is a non-nil empty slice for invalid or
// non-object nodes.
func (o object) getKeys() []string {
	if !o.hasValidNode() {
		return []string{}
	}

	// pre-size from the child count; at most `count` keys can exist
	count := int(o.node.NamedChildCount())
	out := make([]string, 0, count)
	for i := 0; i < count; i++ {
		pair := o.node.NamedChild(i)
		if pair.Type() != "pair" {
			continue
		}

		key := dequote(content(pair.ChildByFieldName("key"), o.source))
		out = append(out, key)
	}
	return out
}
// getObject wraps the value node for key as an object. The result is
// safe to use even when the key is missing (see hasValidNode).
func (o object) getObject(key string) object {
	return newObject(o.getNode(key), o.source)
}

// stringValue dequotes a string-typed value node, falling back to
// defaultVal for nil or non-string nodes. Shared by getString and
// getStringI so the two cannot drift apart.
func (o object) stringValue(value *sitter.Node, defaultVal string) string {
	if value == nil || value.Type() != "string" {
		return defaultVal
	}
	return dequote(content(value, o.source))
}

// getString returns the string value for key, or defaultVal when the
// key is missing or its value is not a string. Keys are matched
// case-sensitively.
func (o object) getString(key, defaultVal string) string {
	return o.stringValue(o.getNode(key), defaultVal)
}

// getStringI is the case-insensitive (for keys) version of getString.
func (o object) getStringI(key, defaultVal string) string {
	return o.stringValue(o.getNodeI(key), defaultVal)
}

16
set.go Normal file
View File

@@ -0,0 +1,16 @@
package jsurls
// set is a collection of unique strings with O(1) membership tests.
// The struct{} value type occupies no storage per entry.
type set map[string]struct{}

// newSet builds a set from a slice of items; duplicates collapse.
func newSet(items []string) set {
	s := make(set, len(items))
	for _, item := range items {
		s[item] = struct{}{}
	}
	return s
}

// Contains reports whether item is a member of the set.
func (s set) Contains(item string) bool {
	_, exists := s[item]
	return exists
}

215
strings.go Normal file
View File

@@ -0,0 +1,215 @@
package jsurls
import (
"strconv"
"strings"
"unicode"
"unicode/utf8"
)
// singleEscapes maps single-character escape codes to the character
// they represent. Hoisted to package level so that item.String does
// not rebuild the map on every call.
var singleEscapes = map[string]string{
	"b": "\b",
	"f": "\f",
	"n": "\n",
	"r": "\r",
	"t": "\t",
	"v": "\v",
}

// item is a single lexed fragment of a JavaScript string: either a
// run of literal text, or the payload of one escape sequence.
type item struct {
	typ itemType
	val string
}

// itemType identifies how an item's val should be decoded.
type itemType int

const (
	itemString          itemType = iota // literal text; no decoding
	itemSingleEscape                    // e.g. \n, \t, \" — single-char escapes
	itemHexEscape                       // \xNN
	itemOctalEscape                     // \NNN
	itemUnicodeEscape                   // \uNNNN
	itemCodepointEscape                 // \u{NNN...}
)

// String decodes the item into the text it represents. Unknown or
// malformed escapes decode to their raw value rather than failing.
func (i item) String() string {
	switch i.typ {
	case itemString:
		return i.val
	case itemSingleEscape:
		if out, exists := singleEscapes[i.val]; exists {
			return out
		}
		// escapes like \' and \\ decode to the character itself
		return i.val
	case itemHexEscape, itemUnicodeEscape, itemCodepointEscape:
		num, err := strconv.ParseInt(i.val, 16, 0)
		if err != nil {
			return i.val
		}
		return string(rune(num))
	case itemOctalEscape:
		num, err := strconv.ParseInt(i.val, 8, 0)
		if err != nil {
			return i.val
		}
		return string(rune(num))
	default:
		return i.val
	}
}
// stringLexer steps through a JavaScript string literal, splitting it
// into items: runs of literal text and escape-sequence payloads.
type stringLexer struct {
	str   string // input being lexed
	start int    // byte offset where the current item began
	pos   int    // byte offset of the next rune to read
	items []item // completed items, in order
	done  bool   // latched once Next reads past the end of str
}

// newStringLexer returns a lexer positioned at the start of in.
func newStringLexer(in string) *stringLexer {
	return &stringLexer{
		str:   in,
		start: 0,
		pos:   0,
		items: make([]item, 0),
		done:  false,
	}
}
// Next consumes and returns the next rune, or -1 once the input is
// exhausted (which also latches done).
func (s *stringLexer) Next() rune {
	if s.pos >= len(s.str) {
		s.done = true
		return -1
	}
	r, l := utf8.DecodeRuneInString(s.str[s.pos:])
	s.pos += l
	return r
}

// Backup un-reads the most recent rune. It is a no-op at the start of
// the input, or after the end has been reached (done is latched).
func (s *stringLexer) Backup() {
	if s.done || s.pos <= 0 {
		return
	}
	_, l := utf8.DecodeLastRuneInString(s.str[:s.pos])
	s.pos -= l
}

// Peek returns the next rune without consuming it.
func (s *stringLexer) Peek() rune {
	r := s.Next()
	s.Backup()
	return r
}

// Emit records everything between start and pos as an item of type t
// and advances start past it.
func (s *stringLexer) Emit(t itemType) {
	s.items = append(s.items, item{
		typ: t,
		val: s.str[s.start:s.pos],
	})
	s.start = s.pos
}

// Ignore discards everything between start and pos.
func (s *stringLexer) Ignore() {
	s.start = s.pos
}

// Accept consumes the next rune if it is in valid, reporting whether
// it did so.
func (s *stringLexer) Accept(valid string) bool {
	if strings.ContainsRune(valid, s.Next()) {
		return true
	}
	s.Backup()
	return false
}

// AcceptN consumes up to n runes from valid, reporting whether
// exactly n were consumed.
func (s *stringLexer) AcceptN(valid string, n int) bool {
	count := 0
	for i := 0; i < n; i++ {
		if s.Accept(valid) {
			count++
		}
	}
	return count == n
}

// AcceptUntil consumes runes up to (but not including) the first
// occurrence of r, or to the end of the input.
func (s *stringLexer) AcceptUntil(r rune) {
	for s.Next() != r && !s.done {
	}
	s.Backup()
}

// AcceptRun consumes a contiguous run of runes from valid.
func (s *stringLexer) AcceptRun(valid string) {
	for strings.ContainsRune(valid, s.Next()) {
	}
	s.Backup()
}

// String concatenates the decoded form of every emitted item.
func (s *stringLexer) String() string {
	out := &strings.Builder{}
	for _, i := range s.items {
		out.WriteString(i.String())
	}
	return out.String()
}
// DecodeString strips surrounding quotes from a JavaScript string
// literal and decodes its escape sequences (\n, \075, \x3d, \u003d,
// \u{3d} etc) into the characters they represent. Malformed escapes
// are passed through rather than rejected.
func DecodeString(in string) string {
	in = dequote(in)
	l := newStringLexer(in)

	validHex := "0123456789abcdefABCDEF"

	for !l.done {
		// everything up to the next backslash is literal text
		l.AcceptUntil('\\')
		l.Emit(itemString)
		if l.done {
			break
		}

		// Ignore the backslash
		l.Next()
		l.Ignore()

		switch l.Next() {
		case 'b', 'f', 'n', 'r', 't', 'v', '\'', '"', '\\':
			l.Emit(itemSingleEscape)
		case '0':
			// It's a \0 (null)
			if !unicode.IsDigit(l.Peek()) {
				l.Emit(itemSingleEscape)
				continue
			}

			// It's an octal escape
			l.AcceptRun("01234567")
			l.Emit(itemOctalEscape)
		case 'x':
			// ignore the x
			l.Ignore()

			// Exactly 2 hex digits
			if l.AcceptN(validHex, 2) {
				l.Emit(itemHexEscape)
			}
		case 'u':
			// ignore the u
			l.Ignore()

			// e.g. \u{00003d}
			if l.Accept("{") {
				l.Ignore()
				l.AcceptRun(validHex)
				l.Emit(itemCodepointEscape)
				if l.Accept("}") {
					l.Ignore()
				}
			}

			// e.g. \u003d — also consumes the tail of an unterminated
			// \u{... form (see the "\u{0003doops" test case).
			// NOTE(review): a well-terminated \u{...} immediately
			// followed by four hex-looking characters (e.g. \u{3d}abcd)
			// would be wrongly consumed here too — confirm intended.
			if l.AcceptN(validHex, 4) {
				l.Emit(itemUnicodeEscape)
			}
		}
	}

	return l.String()
}

82
strings_test.go Normal file
View File

@@ -0,0 +1,82 @@
package jsurls
import (
"testing"
)
// TestStringDecode exercises DecodeString with escapes in the middle,
// at either end, back-to-back, malformed, and from real-world JS.
func TestStringDecode(t *testing.T) {
	cases := []struct {
		in       string
		expected string
	}{
		// middle
		{`"foo bar"`, `foo bar`},
		{`"foo\\bar"`, `foo\bar`},
		{`"foo\"bar"`, `foo"bar`},
		{`"foo\'bar"`, `foo'bar`},
		{`"foo\075bar"`, `foo=bar`},
		{`"foo\tbar"`, "foo\tbar"},
		{`"foo\vbar"`, "foo\vbar"},
		{`"foo\u003dbar"`, "foo=bar"},
		{`"foo\u{00000000003d}bar"`, "foo=bar"},
		// end
		{`"foo\075"`, `foo=`},
		{`"foo\x3d"`, `foo=`},
		{`"foo\\"`, `foo\`},
		// start
		{`"\075foo"`, `=foo`},
		{`"\x3dfoo"`, `=foo`},
		{`"\\foo"`, `\foo`},
		// pairs
		{`"\075\x3d"`, `==`},
		{`"\u{00000003d}\x3d"`, `==`},
		// Invalid
		{`"\poo"`, `poo`},
		{`"\u{0003doops"`, `=oops`},
		// real-world
		{`"/help/doc/user_ed.jsp?loc\x3dhelp\x26target\x3d"`, "/help/doc/user_ed.jsp?loc=help&target="},
	}

	for _, c := range cases {
		actual := DecodeString(c.in)
		if c.expected != actual {
			t.Errorf("Want %s for DecodeString(%s); have %s", c.expected, c.in, actual)
		}
	}
}

// BenchmarkDecodeString runs DecodeString over the same inputs as the
// correctness test above.
func BenchmarkDecodeString(b *testing.B) {
	inputs := []string{
		`"foo bar"`,
		`"foo\\bar"`,
		`"foo\"bar"`,
		`"foo\'bar"`,
		`"foo\075bar"`,
		`"foo\tbar"`,
		`"foo\vbar"`,
		`"foo\u003dbar"`,
		`"foo\u{00000000003d}bar"`,
		`"foo\075"`,
		`"foo\x3d"`,
		`"foo\\"`,
		`"\075foo"`,
		`"\x3dfoo"`,
		`"\\foo"`,
		`"\075\x3d"`,
		`"\u{00000003d}\x3d"`,
		`"\poo"`,
		`"\u{0003doops"`,
		`"/help/doc/user_ed.jsp?loc\x3dhelp\x26target\x3d"`,
	}

	for i := 0; i < b.N; i++ {
		for _, input := range inputs {
			_ = DecodeString(input)
		}
	}
}

202
tree.go Normal file
View File

@@ -0,0 +1,202 @@
package jsurls
import (
"fmt"
"strings"
sitter "github.com/smacker/go-tree-sitter"
"github.com/smacker/go-tree-sitter/javascript"
)
// content returns n.Content(source), or the empty string when n is
// nil, so callers never need to nil-check before reading node text.
func content(n *sitter.Node, source []byte) string {
	if n != nil {
		return n.Content(source)
	}
	return ""
}
// isStringy reports whether n is a string literal, or any other node
// whose source text opens with a quote character (e.g. a template
// literal, which begins with a backtick, or a concatenation whose
// first operand is a quoted string).
func isStringy(n *sitter.Node, source []byte) bool {
	if n.Type() == "string" {
		return true
	}

	c := content(n, source)
	if len(c) == 0 {
		return false
	}

	// Inspect the first byte of the content. (The previous c[0:0]
	// sliced out the empty string, so this check could never match.)
	switch c[0] {
	case '"', '\'', '`':
		return true
	default:
		return false
	}
}
// hasDescendantOfType reports whether n, or any node beneath it, has
// the given tree-sitter node type t.
func hasDescendantOfType(n *sitter.Node, t string) bool {
	if n == nil {
		return false
	}

	// node is provided type exactly
	if n.Type() == t {
		return true
	}

	hasType := false
	enter := func(n *sitter.Node) {
		if n.Type() == t {
			hasType = true
		}
	}

	// NOTE(review): walk has no early exit, so the whole subtree is
	// visited even after a match has been found
	walk(n, enter)

	return hasType
}
// cleanURL takes a node representing a URL and attempts to make it
// at least somewhat easily parseable. It's common to build URLs out
// of variables and function calls so we want to turn something like:
//
//	'./upload.php?profile='+res.id+'&show='+$('.participate_modal_container').attr('data-val')
//
// Into something more like:
//
//	./upload.php?profile=EXPR&show=EXPR
//
// It recurses through binary (concatenation) expressions, keeping
// string literals and replacing every other operand with "EXPR".
// A nil node yields the empty string.
func cleanURL(n *sitter.Node, source []byte) string {
	if n == nil {
		return ""
	}

	switch n.Type() {
	case "binary_expression":
		return fmt.Sprintf(
			"%s%s",
			cleanURL(n.ChildByFieldName("left"), source),
			cleanURL(n.ChildByFieldName("right"), source),
		)
	case "string":
		return dequote(content(n, source))
	default:
		return "EXPR"
	}
}
// dequote removes surrounding JavaScript quote characters (single,
// double, or backtick) from in, leaving the inner text intact.
func dequote(in string) string {
	return strings.TrimRight(strings.TrimLeft(in, "'\"`"), "'\"`")
}
// query compiles and runs a tree-sitter query against the tree rooted
// at n, calling enter for every captured node.
// NOTE(review): a query compile error is silently swallowed here (the
// function just returns) — confirm that's intended for library use.
func query(n *sitter.Node, query string, enter func(*sitter.Node)) {
	q, err := sitter.NewQuery(
		[]byte(query),
		javascript.GetLanguage(),
	)
	if err != nil {
		return
	}

	qc := sitter.NewQueryCursor()
	defer qc.Close()
	qc.Exec(q, n)

	for {
		match, exists := qc.NextMatch()
		if !exists || match == nil {
			break
		}

		for _, capture := range match.Captures {
			enter(capture.Node)
		}
	}
}
// walk performs a manual depth-first traversal of the tree rooted at
// n, calling enter for each node visited. enter is never called for n
// itself — the first visit happens after descending to a child.
func walk(n *sitter.Node, enter func(*sitter.Node)) {
	c := sitter.NewTreeCursor(n)
	defer c.Close()

	// walkies
	recurse := true
	for {
		// descend into the tree
		if recurse && c.GoToFirstChild() {
			recurse = true
			enter(c.CurrentNode())
			continue
		}

		// move sideways
		if c.GoToNextSibling() {
			recurse = true
			enter(c.CurrentNode())
			continue
		}

		// climb back up the tree, but make sure we don't descend right back to where we were
		if c.GoToParent() {
			// recurse=false prevents re-taking the GoToFirstChild
			// branch we just climbed out of
			recurse = false
			continue
		}

		break
	}
}
// PrintTree parses source as JavaScript and pretty-prints the
// resulting syntax tree to stdout.
func PrintTree(source []byte) {
	p := sitter.NewParser()
	p.SetLanguage(javascript.GetLanguage())
	PrettyPrint(p.Parse(nil, source).RootNode(), source)
}
// PrettyPrint writes an indented representation of the tree rooted at
// n to stdout: one line per named node showing its field name (when
// set), its type, and — for leaves and string nodes — its source text.
func PrettyPrint(n *sitter.Node, source []byte) {
	c := sitter.NewTreeCursor(n)
	defer c.Close()

	// walkies
	depth := 0
	recurse := true
	for {
		// print only named nodes, and only on the first visit
		// (recurse is false when we've climbed back up to a node)
		if recurse && c.CurrentNode().IsNamed() {
			fieldName := c.CurrentFieldName()
			if fieldName != "" {
				fieldName += ": "
			}

			contentStr := ""
			if c.CurrentNode().ChildCount() == 0 || c.CurrentNode().Type() == "string" {
				contentStr = fmt.Sprintf(" (%s)", content(c.CurrentNode(), source))
			}
			fmt.Printf("%s%s%s%s\n", strings.Repeat(" ", depth), fieldName, c.CurrentNode().Type(), contentStr)
		}

		// descend into the tree
		if recurse && c.GoToFirstChild() {
			recurse = true
			depth++
			continue
		}

		// move sideways
		if c.GoToNextSibling() {
			recurse = true
			continue
		}

		// climb back up the tree, but make sure we don't descend right back to where we were
		if c.GoToParent() {
			depth--
			recurse = false
			continue
		}

		break
	}
}

45
tree_test.go Normal file
View File

@@ -0,0 +1,45 @@
package jsurls
import (
"strconv"
"testing"
sitter "github.com/smacker/go-tree-sitter"
"github.com/smacker/go-tree-sitter/javascript"
)
// TestCleanURL checks that concatenated URL expressions collapse to a
// template with non-literal parts replaced by "EXPR".
func TestCleanURL(t *testing.T) {
	cases := []struct {
		JS       []byte
		Expected string
	}{
		{[]byte(`"./login.php?redirect="+url`), "./login.php?redirect=EXPR"},
		{[]byte(`'/path/'+['one', 'two', 'three'].join('/')`), "/path/EXPR"},
		{[]byte(`someVar`), "EXPR"},
	}

	parser := sitter.NewParser()
	parser.SetLanguage(javascript.GetLanguage())

	for i, c := range cases {
		t.Run(strconv.Itoa(i), func(t *testing.T) {
			tree := parser.Parse(nil, c.JS)
			root := tree.RootNode()

			// Example tree:
			//   program
			//     expression_statement
			//       binary_expression
			//         left: string ("./login.php?redirect=")
			//         right: identifier (url)
			//
			// We want the binary_expression to pass to cleanURL, which is
			// the first Named Child of the first Named Child of the root node.
			actual := cleanURL(root.NamedChild(0).NamedChild(0), c.JS)
			if actual != c.Expected {
				t.Errorf("want %s for cleanURL(%s), have: %s", c.Expected, c.JS, actual)
			}
		})
	}
}

119
url-match-jquery.go Normal file
View File

@@ -0,0 +1,119 @@
package jsurls
import (
"strings"
sitter "github.com/smacker/go-tree-sitter"
"golang.org/x/exp/slices"
)
// matchJQuery returns a urlMatcher that recognizes jQuery's ajax
// helpers ($.get / $.post / $.ajax and the jQuery.* aliases) and
// extracts the URL, HTTP method, params, and headers from the call.
func matchJQuery() urlMatcher {
	return urlMatcher{"call_expression", func(n *sitter.Node, source []byte) *URL {
		callName := content(n.ChildByFieldName("function"), source)
		if !slices.Contains(
			[]string{
				"$.get", "$.post", "$.ajax",
				"jQuery.get", "jQuery.post", "jQuery.ajax",
			},
			callName,
		) {
			return nil
		}

		// The jQuery ajax calls have a few different call signatures
		// that we need to account for:
		//   jQuery.post( url [, data ] [, success ] [, dataType ] )
		//   jQuery.get( url [, data ] [, success ] [, dataType ] )
		//   jQuery.ajax( url [, settings ] )
		//   jQuery.post( [settings] )
		//   jQuery.get( [settings] )
		//   jQuery.ajax( [settings] )
		//
		// So we end up with three scenarios to deal with:
		//   1. The URL comes first, then a data object
		//   2. The URL comes first, then a settings object
		//   3. A settings object comes first.
		arguments := n.ChildByFieldName("arguments")
		if arguments == nil {
			return nil
		}

		firstArg := arguments.NamedChild(0)
		if firstArg == nil {
			return nil
		}

		// secondArg may legitimately be nil (single-argument forms)
		secondArg := arguments.NamedChild(1)

		m := &URL{
			Type:   callName,
			Source: content(n, source),
		}

		// Infer the method for .post and .get calls
		if strings.HasSuffix(callName, ".post") {
			m.Method = "POST"
		} else if strings.HasSuffix(callName, ".get") {
			m.Method = "GET"
		}

		var settingsNode *sitter.Node
		if isStringy(firstArg, source) {
			// first argument is the URL
			m.URL = cleanURL(firstArg, source)

			// If the first arg is a URL, the second arg is a
			// settings object for $.ajax, or a data object for
			// $.get and $.post
			if strings.HasSuffix(callName, ".ajax") {
				settingsNode = secondArg
			} else {
				params := newObject(secondArg, source).getKeys()
				if m.Method == "GET" {
					m.QueryParams = params
				} else {
					m.BodyParams = params
				}
			}
		}

		if firstArg.Type() == "object" {
			// first argument is a settings object
			settingsNode = firstArg
		}

		if settingsNode == nil {
			// we didn't end up with a settings node,
			// so we can't infer anything else
			return m
		}

		settings := newObject(settingsNode, source)

		if m.URL == "" {
			m.URL = cleanURL(settings.getNode("url"), source)
		}

		m.Headers = settings.getObject("headers").asMap()

		if m.Method == "" {
			// method can be specified as either `method`, or
			// `type`, and defaults to GET
			m.Method = settings.getString(
				"method",
				settings.getString("type", "GET"),
			)
		}

		params := settings.getObject("data").getKeys()
		if m.Method == "GET" {
			m.QueryParams = params
		} else {
			m.BodyParams = params
		}

		return m
	}}
}

167
url-match-xhr.go Normal file
View File

@@ -0,0 +1,167 @@
package jsurls
import (
"strings"
"sync"
sitter "github.com/smacker/go-tree-sitter"
"golang.org/x/exp/slices"
)
// nodeCache is a concurrency-safe map from a parent node to the set
// of interesting nodes previously found beneath it.
type nodeCache struct {
	sync.RWMutex
	data map[*sitter.Node][]*sitter.Node
}

// newNodeCache returns an empty, ready-to-use nodeCache.
func newNodeCache() *nodeCache {
	return &nodeCache{data: map[*sitter.Node][]*sitter.Node{}}
}

// set stores v against k, replacing any previous entry.
func (c *nodeCache) set(k *sitter.Node, v []*sitter.Node) {
	c.Lock()
	defer c.Unlock()
	c.data[k] = v
}

// get returns the cached nodes for k, and whether an entry existed.
func (c *nodeCache) get(k *sitter.Node) ([]*sitter.Node, bool) {
	c.RLock()
	defer c.RUnlock()
	v, exists := c.data[k]
	return v, exists
}
// matchXHR returns a urlMatcher for calls to XMLHttpRequest.open().
// As well as the URL and HTTP method, it attempts to recover request
// headers by looking for setRequestHeader() calls made on the same
// object within the same scope.
func matchXHR() urlMatcher {
	// cache stores the call_expression nodes found under a given
	// scope-defining parent, because sibling .open calls end up
	// querying the exact same parent repeatedly.
	cache := newNodeCache()

	return urlMatcher{"call_expression", func(n *sitter.Node, source []byte) *URL {
		callName := content(n.ChildByFieldName("function"), source)

		// We don't know what the XMLHttpRequest object will be called,
		// so we have to focus on just the .open bit
		if !strings.HasSuffix(callName, ".open") {
			return nil
		}

		// There's a bunch of different stuff we might have matched,
		// including window.open, so we're going to try and guess
		// based on the first argument being a valid HTTP method.
		// This will miss cases where the method is a variable.
		arguments := n.ChildByFieldName("arguments")
		method := dequote(content(arguments.NamedChild(0), source))
		if !slices.Contains(
			[]string{"GET", "HEAD", "OPTIONS", "POST", "PUT", "PATCH", "DELETE"},
			method,
		) {
			return nil
		}

		// the second argument must look like a string for us to have any
		// chance of extracting a usable URL from it
		urlArg := arguments.NamedChild(1)
		if !isStringy(urlArg, source) {
			return nil
		}

		match := &URL{
			URL:    cleanURL(urlArg, source),
			Method: method,
			Type:   "XMLHttpRequest.open",
			Source: content(n, source),
		}

		// to find headers we need to look for calls to setRequestHeader() on
		// the same object as the .open call. We'll stick to the same scope
		// (i.e. sibling expressions) because we have no way to know if we're
		// dealing with the same object or not otherwise.
		objectName := strings.TrimSuffix(callName, ".open")

		// We want to find the parent/ancestor node that defines the scope in which
		// we are calling XHR.open(). JavaScript has three types of scope: global,
		// function, and block. Block scope only comes into play if values
		// are defined using 'let', or 'const'. We don't know if the XHR object
		// was defined with let or const, so we're just going to ignore block scope.
		// That leaves us with global scope and function scope. To find those we
		// can ascend the tree until we hit a node with type "function_declaration",
		// or we hit a nil parent.
		parent := n.Parent()
		if parent == nil {
			// no scope to search for headers; return the bare match
			return match
		}
		for {
			candidate := parent.Parent()
			if candidate == nil {
				// ran out of ancestors: parent is the global scope
				break
			}
			parent = candidate

			if parent.Type() == "function_declaration" {
				break
			}
		}

		// Look for call_expressions under the same parent as our .open call.
		// It's common to end up querying the exact same parent over and over
		// again, so we cache the results on a per-parent node basis.
		nodes := make([]*sitter.Node, 0)
		if v, exists := cache.get(parent); exists {
			nodes = v
		} else {
			q := `
				(call_expression
					function: (member_expression
						object: (identifier)
						property: (property_identifier)
					)
					arguments: (arguments (string))
				) @matches
			`
			query(parent, q, func(sibling *sitter.Node) {
				nodes = append(nodes, sibling)
			})
			cache.set(parent, nodes)
		}

		headers := make(map[string]string)

		// TODO: I think we can get more accuracy here by relying on the fact that
		// the .setRequestHeader calls we're interested in must come *after* the .open
		// call in order to be valid. In theory that means we can skip any nodes at
		// all that come before the .open call we're currently looking at. We could
		// also stop looking after we see a .send call on the same object, although
		// it's possible for the .send to be wrapped in a conditional so that might
		// cause us to miss some values.
		for _, sibling := range nodes {
			name := content(sibling.ChildByFieldName("function"), source)
			if !strings.HasSuffix(name, ".setRequestHeader") {
				continue
			}
			if !strings.HasPrefix(name, objectName) {
				continue
			}

			args := sibling.ChildByFieldName("arguments")
			headerNode := args.NamedChild(0)
			if headerNode == nil || headerNode.Type() != "string" {
				continue
			}
			header := dequote(content(headerNode, source))

			// first value seen for a given header wins
			if _, exists := headers[header]; exists {
				continue
			}

			// non-literal header values (e.g. variables) are left empty
			var value string
			valueNode := args.NamedChild(1)
			if valueNode != nil && valueNode.Type() == "string" {
				value = dequote(content(valueNode, source))
			}
			headers[header] = value
		}
		match.Headers = headers

		return match
	}}
}

284
url-matchers.go Normal file
View File

@@ -0,0 +1,284 @@
package jsurls
import (
"net/url"
"regexp"
"strings"
sitter "github.com/smacker/go-tree-sitter"
)
// URL is any URL found in the source code, along with details about how
// it is used: the HTTP method, any discovered parameters and headers, etc.
type URL struct {
	URL         string            `json:"url"`
	QueryParams []string          `json:"queryParams"`
	BodyParams  []string          `json:"bodyParams"`
	Method      string            `json:"method"`
	Headers     map[string]string `json:"headers,omitempty"`
	ContentType string            `json:"contentType,omitempty"`

	// Type is some description of how the URL was found, like
	// locationAssignment, fetch, $.post or something like that
	Type string `json:"type"`

	// Source is the full source/content of the matched node; it is optional
	Source string `json:"source,omitempty"`

	// Filename is the filename in which the match was found
	Filename string `json:"filename,omitempty"`
}
// GetURLs searches the JavaScript source code for absolute and relative URLs and returns
// a slice of results.
func (a *Analyzer) GetURLs() []*URL {
	matches := make([]*URL, 0)

	// re matches anything that isn't an ASCII letter; it's used below to
	// reduce a candidate URL to just its letters so we can spot URLs made
	// up entirely of EXPR replacements.
	re := regexp.MustCompile("[^A-Za-z]")

	// function to run on entry to each node in the tree
	enter := func(n *sitter.Node) {
		for _, matcher := range a.urlMatchers {
			// matchers only apply to one node type each
			if matcher.Type != n.Type() {
				continue
			}

			match := matcher.Fn(n, a.source)
			if match == nil {
				continue
			}

			// decode any escapes in the URL
			match.URL = DecodeString(match.URL)

			// an empty slice is easier to deal with than null, e.g when using jq
			if match.QueryParams == nil {
				match.QueryParams = []string{}
			}
			if match.BodyParams == nil {
				match.BodyParams = []string{}
			}

			// Filter out data: and tel: schemes
			lower := strings.ToLower(match.URL)
			if strings.HasPrefix(lower, "data:") || strings.HasPrefix(lower, "tel:") {
				continue
			}

			// Look for URLs that are entirely made up of EXPR replacements
			// and skip them. Maybe this should be optional? Maybe it should
			// remove things like EXPR#EXPR etc too
			letters := re.ReplaceAllString(match.URL, "")
			if strings.ReplaceAll(letters, "EXPR", "") == "" {
				continue
			}

			// Parse any query params out of the URL and add them. Some, but not
			// all of the matchers will add query params, so we want to do it here
			// and then remove duplicates
			u, err := url.Parse(match.URL)
			if err == nil {
				// manually disallow www.w3.org just because it shows up so damn often
				if u.Hostname() == "www.w3.org" {
					continue
				}

				for p := range u.Query() {
					// Ignore params that were expressions
					if p == "EXPR" {
						continue
					}
					match.QueryParams = append(match.QueryParams, p)
				}
			}
			match.QueryParams = unique(match.QueryParams)

			matches = append(matches, match)
		}
	}

	// find the nodes we need in the tree and run the enter function for every node
	query(a.rootNode, "[(assignment_expression) (call_expression) (string)] @matches", enter)

	return matches
}
// unique returns a new slice containing the distinct values from items.
// The order of the returned values is not guaranteed (it follows Go's
// randomised map iteration order).
func unique[T comparable](items []T) []T {
	// map[T]struct{} gives us a set with zero-width values
	set := make(map[T]struct{}, len(items))
	for _, item := range items {
		set[item] = struct{}{}
	}

	out := make([]T, 0, len(set))
	for item := range set {
		out = append(out, item)
	}
	return out
}
// A URLMatcher has a Type of node it matches against (e.g.
// assignment_expression, call_expression, or string), and a function Fn
// that inspects a node of that type along with the file's source bytes,
// returning a *URL on a match or nil otherwise.
type URLMatcher struct {
	Type string
	Fn   func(*sitter.Node, []byte) *URL
}
// AllURLMatchers returns the complete built-in set of URLMatchers:
// XMLHttpRequest.open, jQuery's ajax helpers, location assignment and
// replacement, window.open, fetch, and bare string literals that look
// like URLs.
func AllURLMatchers() []URLMatcher {

	// assignment targets that we always treat as URL-bearing
	assignmentNames := newSet([]string{
		"location",
		"this.url",
		"this._url",
		"this.baseUrl",
	})

	// isInterestingAssignment reports whether the left-hand side of an
	// assignment is a name we consider URL-bearing
	isInterestingAssignment := func(name string) bool {
		if assignmentNames.Contains(name) {
			return true
		}
		if strings.HasSuffix(name, ".href") {
			return true
		}
		if strings.HasSuffix(name, ".location") {
			return true
		}
		return false
	}

	matchers := []URLMatcher{
		// XMLHttpRequest.open(method, url)
		matchXHR(),

		// $.post, $.get, and $.ajax
		matchJQuery(),

		// location assignment
		{"assignment_expression", func(n *sitter.Node, source []byte) *URL {
			left := n.ChildByFieldName("left")
			right := n.ChildByFieldName("right")

			if !isInterestingAssignment(content(left, source)) {
				return nil
			}

			// We want to find values that at least *start* with a string of some kind.
			// This might be kind of useful to the crawler:
			//
			//   location.href = "/somePath/" + someVar;
			//
			// Where as this tends to end up being kind of useless:
			//
			//   location.href = someVar + "/somePath/";
			//
			// So while we might miss out on some things this way, they probably wouldn't
			// have been super useful to anything automated anyway.
			rightContent := content(right, source)
			if len(rightContent) < 2 {
				return nil
			}
			p := rightContent[0:1]
			if p != `"` && p != "'" && p != "`" {
				return nil
			}

			return &URL{
				URL:    cleanURL(right, source),
				Method: "GET",
				Type:   "locationAssignment",
				Source: content(n, source),
			}
		}},

		// location replacement
		{"call_expression", func(n *sitter.Node, source []byte) *URL {
			callName := content(n.ChildByFieldName("function"), source)
			if !strings.HasSuffix(callName, "location.replace") {
				return nil
			}

			arguments := n.ChildByFieldName("arguments")

			// check the argument contains at least one string literal
			if !hasDescendantOfType(arguments.NamedChild(0), "string") {
				return nil
			}

			return &URL{
				URL:    cleanURL(arguments.NamedChild(0), source),
				Method: "GET",
				Type:   "locationReplacement",
				Source: content(n, source),
			}
		}},

		// window.open(url)
		{"call_expression", func(n *sitter.Node, source []byte) *URL {
			callName := content(n.ChildByFieldName("function"), source)
			if callName != "window.open" && callName != "open" {
				return nil
			}

			arguments := n.ChildByFieldName("arguments")

			// check the argument contains at least one string literal
			if !hasDescendantOfType(arguments.NamedChild(0), "string") {
				return nil
			}

			return &URL{
				URL:    cleanURL(arguments.NamedChild(0), source),
				Method: "GET",
				Type:   "window.open",
				Source: content(n, source),
			}
		}},

		// fetch(url, [init])
		{"call_expression", func(n *sitter.Node, source []byte) *URL {
			callName := content(n.ChildByFieldName("function"), source)
			if callName != "fetch" {
				return nil
			}

			arguments := n.ChildByFieldName("arguments")

			// check the argument contains at least one string literal
			if !hasDescendantOfType(arguments.NamedChild(0), "string") {
				return nil
			}

			// the optional second argument is an init object that can carry
			// the method and headers
			init := newObject(arguments.NamedChild(1), source)

			return &URL{
				URL:         cleanURL(arguments.NamedChild(0), source),
				Method:      init.getString("method", "GET"),
				Headers:     init.getObject("headers").asMap(),
				ContentType: init.getObject("headers").getStringI("content-type", ""),
				Type:        "fetch",
				Source:      content(n, source),
			}
		}},

		// string literals
		{"string", func(n *sitter.Node, source []byte) *URL {
			trimmed := dequote(content(n, source))
			if !MaybeURL(trimmed) {
				return nil
			}

			return &URL{
				URL:    trimmed,
				Type:   "stringLiteral",
				Source: content(n, source),
			}
		}},
	}

	return matchers
}