Import of jsurls
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
*.sw*
|
||||
cpu.pprof
|
||||
8
README.mkd
Normal file
8
README.mkd
Normal file
@@ -0,0 +1,8 @@
|
||||
# jsluice
|
||||
|
||||
A Go package and tool for extracting URLs, secrets, and other interesting data from JavaScript files.
|
||||
Uses [go-tree-sitter](https://github.com/smacker/go-tree-sitter) for parsing.
|
||||
|
||||
|
||||
|
||||
|
||||
31
analyzer.go
Normal file
31
analyzer.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/javascript"
|
||||
)
|
||||
|
||||
// Analyzer could be considered the core type of jsluice. It wraps
|
||||
// the parse tree for a JavaScript file and provides mechanisms to
|
||||
// extract URLs, secrets etc
|
||||
type Analyzer struct {
|
||||
source []byte
|
||||
parser *sitter.Parser
|
||||
urlMatchers []URLMatcher
|
||||
rootNode *sitter.Node
|
||||
}
|
||||
|
||||
// NewAnalyzer accepts a slice of bytes representing some JavaScript
|
||||
// source code and returns a pointer to a new Analyzer
|
||||
func NewAnalyzer(source []byte) *Analyzer {
|
||||
parser := sitter.NewParser()
|
||||
parser.SetLanguage(javascript.GetLanguage())
|
||||
tree := parser.Parse(nil, source)
|
||||
|
||||
return &Analyzer{
|
||||
source: source,
|
||||
parser: parser,
|
||||
urlMatchers: AllURLMatchers(),
|
||||
rootNode: tree.RootNode(),
|
||||
}
|
||||
}
|
||||
21
analyzer_test.go
Normal file
21
analyzer_test.go
Normal file
@@ -0,0 +1,21 @@
|
||||
package jsurls
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestAnalyzerBasic(t *testing.T) {
|
||||
a := NewAnalyzer([]byte(`
|
||||
function foo(){
|
||||
document.location = "/logout"
|
||||
}
|
||||
`))
|
||||
|
||||
urls := a.GetURLs()
|
||||
|
||||
if len(urls) != 1 {
|
||||
t.Errorf("Expected exactly 1 URL; got %d", len(urls))
|
||||
}
|
||||
|
||||
if urls[0].URL != "/logout" {
|
||||
t.Errorf("Expected first URL to be '/logout'; got %s", urls[0].URL)
|
||||
}
|
||||
}
|
||||
1
cmd/jsurls-sinks/.gitignore
vendored
Normal file
1
cmd/jsurls-sinks/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
jsurls-sinks
|
||||
6
cmd/jsurls-sinks/README.mkd
Normal file
6
cmd/jsurls-sinks/README.mkd
Normal file
@@ -0,0 +1,6 @@
|
||||
# jsurls-sinks
|
||||
|
||||
Development tool for finding places in JavaScript files that use URLs.
|
||||
|
||||
The idea is you can run this against a whole bunch of JS files and it will spit out places that
|
||||
might be good to add to the main `jsurls` matchers.
|
||||
194
cmd/jsurls-sinks/main.go
Normal file
194
cmd/jsurls-sinks/main.go
Normal file
@@ -0,0 +1,194 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/javascript"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
reWhitespace := regexp.MustCompile(`\s{2,}`)
|
||||
reJSName := regexp.MustCompile(`^[a-zA-Z0-9_$.-]+$`)
|
||||
|
||||
flag.Parse()
|
||||
source, err := ioutil.ReadFile(flag.Arg(0))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
parser := sitter.NewParser()
|
||||
parser.SetLanguage(javascript.GetLanguage())
|
||||
|
||||
tree := parser.Parse(nil, source)
|
||||
root := tree.RootNode()
|
||||
|
||||
enter := func(n *sitter.Node) {
|
||||
switch n.Type() {
|
||||
case "assignment_expression":
|
||||
left := n.ChildByFieldName("left")
|
||||
right := n.ChildByFieldName("right")
|
||||
if left == nil || right == nil {
|
||||
return
|
||||
}
|
||||
|
||||
rightContent := right.Content(source)
|
||||
if !startsWithString(rightContent) {
|
||||
return
|
||||
}
|
||||
|
||||
rightContent = reWhitespace.ReplaceAllString(rightContent, " ")
|
||||
rightStr := dequote(right.Content(source))
|
||||
|
||||
if couldBePath(rightStr) {
|
||||
fmt.Printf("%s (assignment)\n", left.Content(source))
|
||||
}
|
||||
|
||||
case "call_expression":
|
||||
callName := n.ChildByFieldName("function").Content(source)
|
||||
// It's common to find things like immediately called anonymous functions
|
||||
// in JS source, and we don't care about those because we could never match
|
||||
// on them
|
||||
if !reJSName.MatchString(callName) {
|
||||
return
|
||||
}
|
||||
|
||||
arguments := n.ChildByFieldName("arguments")
|
||||
if arguments == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// we want to iterate over the arguments and find
|
||||
// any that look like a url
|
||||
c := sitter.NewTreeCursor(arguments)
|
||||
defer c.Close()
|
||||
|
||||
// no args
|
||||
if !c.GoToFirstChild() {
|
||||
return
|
||||
}
|
||||
|
||||
foundPath := false
|
||||
position := 0
|
||||
for {
|
||||
arg := c.CurrentNode()
|
||||
if arg == nil {
|
||||
break
|
||||
}
|
||||
|
||||
// named args only (i.e. don't count commas etc)
|
||||
if arg.IsNamed() {
|
||||
|
||||
argContent := arg.Content(source)
|
||||
if startsWithString(argContent) && couldBePath(dequote(argContent)) {
|
||||
foundPath = true
|
||||
break
|
||||
}
|
||||
position++
|
||||
}
|
||||
|
||||
if !c.GoToNextSibling() {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if foundPath {
|
||||
fmt.Printf("%s (arg %d)\n", callName, position)
|
||||
}
|
||||
}
|
||||
}
|
||||
queryNodes(root, enter)
|
||||
}
|
||||
|
||||
// startsWithString reports whether in looks like a quoted JavaScript
// string literal: it must be at least two characters long and begin
// with a single quote, double quote, or backtick.
func startsWithString(in string) bool {
	if len(in) < 2 {
		return false
	}

	switch in[0] {
	case '"', '\'', '`':
		return true
	}

	return false
}
|
||||
|
||||
// couldBePath reports whether in looks like a URL or path worth
// reporting. Each accepted prefix carries a minimum length so that
// bare prefixes (e.g. "http://" alone, or just "/") don't match.
func couldBePath(in string) bool {
	prefixes := []struct {
		prefix string
		minLen int
	}{
		{"http:", 8},
		{"https:", 9},
		{"/", 4},
		{"./", 5},
	}

	for _, p := range prefixes {
		if strings.HasPrefix(in, p.prefix) && len(in) >= p.minLen {
			return true
		}
	}

	return false
}
|
||||
|
||||
func queryNodes(n *sitter.Node, enter func(*sitter.Node)) {
|
||||
|
||||
query, err := sitter.NewQuery(
|
||||
[]byte("[(assignment_expression) (call_expression)] @matches"),
|
||||
javascript.GetLanguage(),
|
||||
)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
qc := sitter.NewQueryCursor()
|
||||
defer qc.Close()
|
||||
|
||||
qc.Exec(query, n)
|
||||
|
||||
for {
|
||||
match, exists := qc.NextMatch()
|
||||
if !exists || match == nil {
|
||||
break
|
||||
}
|
||||
|
||||
for _, capture := range match.Captures {
|
||||
enter(capture.Node)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func walk(n *sitter.Node, enter func(*sitter.Node)) {
|
||||
|
||||
c := sitter.NewTreeCursor(n)
|
||||
defer c.Close()
|
||||
|
||||
// walkies
|
||||
recurse := true
|
||||
for {
|
||||
// descend into the tree
|
||||
if recurse && c.GoToFirstChild() {
|
||||
recurse = true
|
||||
enter(c.CurrentNode())
|
||||
continue
|
||||
}
|
||||
|
||||
// move sideways
|
||||
if c.GoToNextSibling() {
|
||||
recurse = true
|
||||
enter(c.CurrentNode())
|
||||
continue
|
||||
}
|
||||
|
||||
// climb back up the tree, but make sure we don't descend right back to where we were
|
||||
if c.GoToParent() {
|
||||
recurse = false
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// dequote strips any leading and trailing JavaScript quote
// characters (single, double, or backtick) from in.
func dequote(in string) string {
	const quotes = "'\"`"
	return strings.Trim(in, quotes)
}
|
||||
1
cmd/jsurls/.gitignore
vendored
Normal file
1
cmd/jsurls/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
jsurls
|
||||
77
cmd/jsurls/README.mkd
Normal file
77
cmd/jsurls/README.mkd
Normal file
@@ -0,0 +1,77 @@
|
||||
# jsurls
|
||||
|
||||
Extract URLs and their parameters from JavaScript files. Uses [go-tree-sitter](https://github.com/smacker/go-tree-sitter) for parsing.
|
||||
|
||||
## Install
|
||||
|
||||
Run `go install` in this directory, or `go install github.com/bishopfoxmss/jsurls/cmd/jsurls@latest`
|
||||
|
||||
If you want the `go install github.com/bish...` command to work, you'll need to make sure you have configured git and Go
|
||||
to behave properly. In your `~/.gitconfig` you need this to make sure clones are done with SSH:
|
||||
|
||||
```
|
||||
[url "git@github.com:"]
|
||||
insteadOf = https://github.com/
|
||||
```
|
||||
|
||||
You also need to tell Go not to use their public proxy for packages under `github.com/bishopfoxmss/`.
|
||||
Run this, and/or put this in your `~/.bashrc` or equivalent:
|
||||
|
||||
```
|
||||
go env -w GOPRIVATE='github.com/bishopfoxmss/*'
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
Regular usage is with `jsurls <filename>`. The output is a JSON stream, so you'll probably
|
||||
want to pipe to `jq` for formatting in most cases:
|
||||
|
||||
```
|
||||
▶ jsurls testdata/jquery-post.js | jq
|
||||
{
|
||||
"url": "demo_test_post.asp",
|
||||
"params": [
|
||||
"name",
|
||||
"city"
|
||||
],
|
||||
"method": "POST",
|
||||
"type": "$.post"
|
||||
}
|
||||
{
|
||||
"url": "/logout.php",
|
||||
"params": [
|
||||
"redirect"
|
||||
],
|
||||
"method": "GET",
|
||||
"type": "$.get"
|
||||
}
|
||||
```
|
||||
|
||||
You can print the tree for a JS file with the `--tree`/`-t` flag. This is a useful reference when writing matchers:
|
||||
|
||||
```
|
||||
▶ jsurls testdata/hello.js --tree
|
||||
program
|
||||
expression_statement
|
||||
call_expression
|
||||
function: member_expression
|
||||
object: identifier (console)
|
||||
property: property_identifier (log)
|
||||
arguments: arguments
|
||||
string ("Hello, world!")
|
||||
```
|
||||
|
||||
You can see the source code for each match with the `--include-source`/`-i` flag:
|
||||
|
||||
```
|
||||
▶ jsurls testdata/jquery-post.js --include-source | head -n1 | jq .source -r
|
||||
$.post("demo_test_post.asp",
|
||||
{
|
||||
name: "Donald Duck",
|
||||
city: "Duckburg"
|
||||
},
|
||||
function(data, status){
|
||||
alert("Data: " + data + "\nStatus: " + status);
|
||||
document.location = data.nextURL
|
||||
})
|
||||
```
|
||||
148
cmd/jsurls/main.go
Normal file
148
cmd/jsurls/main.go
Normal file
@@ -0,0 +1,148 @@
|
||||
package main
|
||||
|
||||
// Extract URLs and related stuff out of JavaScript files
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/bishopfoxmss/jsurls"
|
||||
"github.com/pkg/profile"
|
||||
flag "github.com/spf13/pflag"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var treeMode bool
|
||||
flag.BoolVarP(&treeMode, "tree", "t", false, "Just print the tree for the provided file")
|
||||
|
||||
var includeSource bool
|
||||
flag.BoolVarP(&includeSource, "include-source", "i", false, "Include the source code where the URL was found")
|
||||
|
||||
var ignoreStrings bool
|
||||
flag.BoolVar(&ignoreStrings, "ignore-strings", false, "Ignore matches from string literals")
|
||||
|
||||
var includeFilename bool
|
||||
flag.BoolVar(&includeFilename, "include-filename", false, "Include the filename of the matched file in the output")
|
||||
|
||||
var profileMode bool
|
||||
flag.BoolVar(&profileMode, "profile", false, "Profile cpu usage and save a cpu.pprof file in the current dir")
|
||||
|
||||
var concurrency int
|
||||
flag.IntVarP(&concurrency, "concurrency", "c", 1, "Number of files to process concurrently")
|
||||
|
||||
var resolve string
|
||||
flag.StringVarP(&resolve, "resolve", "r", "", "Resolve relative paths using the absolute URL provided")
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if profileMode {
|
||||
defer profile.Start(profile.ProfilePath(".")).Stop()
|
||||
}
|
||||
|
||||
var resolveURL *url.URL
|
||||
var err error
|
||||
if resolve != "" {
|
||||
resolveURL, err = url.Parse(resolve)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "failed to parse resolve URL: %s\n", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var input io.Reader = os.Stdin
|
||||
if flag.Arg(0) != "" {
|
||||
input = strings.NewReader(
|
||||
strings.Join(flag.Args(), "\n"),
|
||||
)
|
||||
}
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
jobs := make(chan string)
|
||||
matches := make(chan *jsurls.Match)
|
||||
|
||||
for i := 0; i < concurrency; i++ {
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
extractor := jsurls.NewExtractor()
|
||||
|
||||
for filename := range jobs {
|
||||
|
||||
source, err := ioutil.ReadFile(filename)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s", err)
|
||||
continue
|
||||
}
|
||||
|
||||
// print just the tree and stop
|
||||
if treeMode {
|
||||
fmt.Printf("%s:\n", filename)
|
||||
jsurls.PrintTree(source)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, m := range extractor.GetMatches(source) {
|
||||
m.Filename = filename
|
||||
matches <- m
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
}
|
||||
|
||||
// read jobs from the input reader, send on jobs channel, close jobs channel
|
||||
go func() {
|
||||
sc := bufio.NewScanner(input)
|
||||
for sc.Scan() {
|
||||
filename := sc.Text()
|
||||
jobs <- filename
|
||||
}
|
||||
close(jobs)
|
||||
|
||||
wg.Wait()
|
||||
close(matches)
|
||||
}()
|
||||
|
||||
// read and filter the results
|
||||
for m := range matches {
|
||||
|
||||
if ignoreStrings && m.Type == "stringLiteral" {
|
||||
continue
|
||||
}
|
||||
|
||||
// remove filename if the user doesn't want it
|
||||
if !includeFilename {
|
||||
m.Filename = ""
|
||||
}
|
||||
|
||||
// remove any souce if we don't want to display it
|
||||
if !includeSource {
|
||||
m.Source = ""
|
||||
}
|
||||
|
||||
if resolveURL != nil {
|
||||
parsed, err := url.Parse(m.URL)
|
||||
if err == nil {
|
||||
m.URL = resolveURL.ResolveReference(parsed).String()
|
||||
}
|
||||
}
|
||||
|
||||
j, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s\n", err)
|
||||
continue
|
||||
}
|
||||
fmt.Printf("%s\n", j)
|
||||
}
|
||||
|
||||
}
|
||||
1
cmd/treehugger/.gitignore
vendored
Normal file
1
cmd/treehugger/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
treehugger
|
||||
64
cmd/treehugger/main.go
Normal file
64
cmd/treehugger/main.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/javascript"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
flag.Parse()
|
||||
queryStr := flag.Arg(0)
|
||||
|
||||
parser := sitter.NewParser()
|
||||
parser.SetLanguage(javascript.GetLanguage())
|
||||
|
||||
sc := bufio.NewScanner(os.Stdin)
|
||||
for sc.Scan() {
|
||||
source, err := ioutil.ReadFile(sc.Text())
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "error opening file: %s\n", err)
|
||||
continue
|
||||
}
|
||||
|
||||
enter := func(n *sitter.Node) {
|
||||
content := n.Content(source)
|
||||
fmt.Println(content)
|
||||
}
|
||||
|
||||
tree := parser.Parse(nil, source)
|
||||
root := tree.RootNode()
|
||||
|
||||
query(root, queryStr, enter)
|
||||
}
|
||||
}
|
||||
|
||||
func query(n *sitter.Node, queryStr string, enter func(*sitter.Node)) {
|
||||
|
||||
q, err := sitter.NewQuery([]byte(queryStr), javascript.GetLanguage())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
qc := sitter.NewQueryCursor()
|
||||
defer qc.Close()
|
||||
|
||||
qc.Exec(q, n)
|
||||
|
||||
for {
|
||||
match, exists := qc.NextMatch()
|
||||
if !exists || match == nil {
|
||||
break
|
||||
}
|
||||
|
||||
for _, capture := range match.Captures {
|
||||
enter(capture.Node)
|
||||
}
|
||||
}
|
||||
}
|
||||
10
go.mod
Normal file
10
go.mod
Normal file
@@ -0,0 +1,10 @@
|
||||
module github.com/bishopfoxmss/jsurls
|
||||
|
||||
go 1.18
|
||||
|
||||
require (
|
||||
github.com/pkg/profile v1.6.0 // indirect
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
golang.org/x/exp v0.0.0-20220706164943-b4a6d9510983 // indirect
|
||||
)
|
||||
24
go.sum
Normal file
24
go.sum
Normal file
@@ -0,0 +1,24 @@
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/pkg/profile v1.6.0 h1:hUDfIISABYI59DyeB3OTay/HxSRwTQ8rB/H83k6r5dM=
|
||||
github.com/pkg/profile v1.6.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220421092837-ec55f7cfeaf4 h1:UFOHRX5nrxNCVORhicjy31nzSVt9rEjf/YRcx2Dc3MM=
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220421092837-ec55f7cfeaf4/go.mod h1:EiUuVMUfLQj8Sul+S8aKWJwQy7FRYnJCO2EWzf8F5hk=
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220623130553-1191a8204295 h1:z1lMT/t6SS2A1nYHqcn/61C4uJptpdTyWGnh2P7xBpE=
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220623130553-1191a8204295/go.mod h1:q99oHDsbP0xRwmn7Vmob8gbSMNyvJ83OauXPSuHQuKE=
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11 h1:l4ch+twh4vEZ5VDPyiqC/6h8BhGWHiDxdFRN4M/ZAck=
|
||||
github.com/smacker/go-tree-sitter v0.0.0-20220628134258-ac06e95cfa11/go.mod h1:q99oHDsbP0xRwmn7Vmob8gbSMNyvJ83OauXPSuHQuKE=
|
||||
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
|
||||
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.4/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
golang.org/x/exp v0.0.0-20220706164943-b4a6d9510983 h1:sUweFwmLOje8KNfXAVqGGAsmgJ/F8jJ6wBLJDt4BTKY=
|
||||
golang.org/x/exp v0.0.0-20220706164943-b4a6d9510983/go.mod h1:Kr81I6Kryrl9sr8s2FK3vxD90NdsKWRuOIl2O4CvYbA=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
74
maybeurl.go
Normal file
74
maybeurl.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var fileExtensions set
|
||||
|
||||
func init() {
|
||||
fileExtensions = newSet([]string{
|
||||
"js", "css", "html", "htm", "xhtml", "xlsx",
|
||||
"xls", "docx", "doc", "pdf", "rss", "xml",
|
||||
"php", "phtml", "asp", "aspx", "asmx", "ashx",
|
||||
"cgi", "pl", "rb", "py", "do", "jsp",
|
||||
"jspa", "json", "jsonp", "txt",
|
||||
})
|
||||
}
|
||||
|
||||
func MaybeURL(in string) bool {
|
||||
// This should eliminate a pretty big percentage of
|
||||
// string literals that we find, and avoid spending
|
||||
// the resources on parsing them as URLs
|
||||
if !strings.ContainsAny(in, "/?") {
|
||||
return false
|
||||
}
|
||||
|
||||
// We want to be fairly restrictive to cut out things
|
||||
// like regex strings, blocks of HTML etc. We will miss
|
||||
// a handful of URLs this way, but that's probably
|
||||
// better than spitting out a ton of false-positives
|
||||
if strings.ContainsAny(in, " ()!<>'\"`{}^$,") {
|
||||
return false
|
||||
}
|
||||
|
||||
// Let's attempt to parse it as a URL, so we can
|
||||
// do some analysis on the individual parts
|
||||
u, err := url.Parse(in)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// Valid-scheme?
|
||||
if u.Scheme != "" {
|
||||
s := strings.ToLower(u.Scheme)
|
||||
if s != "http" && s != "https" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Valid-looking hostname?
|
||||
if len(strings.Split(u.Hostname(), ".")) > 1 {
|
||||
return true
|
||||
}
|
||||
|
||||
// Valid query string with at least one value?
|
||||
for _, vv := range u.Query() {
|
||||
if len(vv) > 0 && len(vv[0]) > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Known file extensions is the last thing we want to
|
||||
// check so if there's no dot then it's a no from us.
|
||||
if !strings.ContainsAny(u.Path, ".") {
|
||||
return false
|
||||
}
|
||||
|
||||
parts := strings.Split(u.Path, ".")
|
||||
ext := parts[len(parts)-1]
|
||||
|
||||
return fileExtensions.Contains(ext)
|
||||
|
||||
}
|
||||
39
maybeurl_test.go
Normal file
39
maybeurl_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMaybeURL(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
expected bool
|
||||
}{
|
||||
{"https://example.com", true},
|
||||
{"https://example.net/api/v1", true},
|
||||
{"HTTP://example.net/api/v1", true},
|
||||
{"application/json", false},
|
||||
{"text/plain", false},
|
||||
{"//example.org", true},
|
||||
{"example.org", false},
|
||||
{"foo?id=123", true},
|
||||
{"Who? Me?", false},
|
||||
{"foo.php?id", true},
|
||||
{"foo.lolno?id", false},
|
||||
{"/foo/bar.html", true},
|
||||
{"./foo/bar.html", true},
|
||||
{`~[A-Z](?=[/|([{\u003c\\\"'])`, false},
|
||||
|
||||
// These might look like paths to humans, but we couldn't
|
||||
// be confident enough about them programmatically
|
||||
{"./", false},
|
||||
{"foo/bar", false},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := MaybeURL(c.in)
|
||||
if actual != c.expected {
|
||||
t.Errorf("want %t for MaybeURL(%s); have %t", c.expected, c.in, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
112
objects.go
Normal file
112
objects.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
)
|
||||
|
||||
type object struct {
|
||||
node *sitter.Node
|
||||
source []byte
|
||||
}
|
||||
|
||||
func newObject(n *sitter.Node, source []byte) object {
|
||||
return object{
|
||||
node: n,
|
||||
source: source,
|
||||
}
|
||||
}
|
||||
|
||||
func (o object) asMap() map[string]string {
|
||||
out := make(map[string]string, 0)
|
||||
if !o.hasValidNode() {
|
||||
return out
|
||||
}
|
||||
|
||||
for _, k := range o.getKeys() {
|
||||
out[k] = o.getString(k, "")
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (o object) hasValidNode() bool {
|
||||
return o.node != nil && o.node.Type() == "object"
|
||||
}
|
||||
|
||||
func (o object) getNodeFunc(fn func(key string) bool) *sitter.Node {
|
||||
if !o.hasValidNode() {
|
||||
return nil
|
||||
}
|
||||
|
||||
count := int(o.node.NamedChildCount())
|
||||
|
||||
for i := 0; i < count; i++ {
|
||||
pair := o.node.NamedChild(i)
|
||||
|
||||
if pair.Type() != "pair" {
|
||||
continue
|
||||
}
|
||||
|
||||
if !fn(dequote(content(pair.ChildByFieldName("key"), o.source))) {
|
||||
continue
|
||||
}
|
||||
|
||||
return pair.ChildByFieldName("value")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o object) getNode(key string) *sitter.Node {
|
||||
return o.getNodeFunc(func(candidate string) bool {
|
||||
return key == candidate
|
||||
})
|
||||
}
|
||||
|
||||
func (o object) getNodeI(key string) *sitter.Node {
|
||||
key = strings.ToLower(key)
|
||||
return o.getNodeFunc(func(candidate string) bool {
|
||||
return key == strings.ToLower(candidate)
|
||||
})
|
||||
}
|
||||
|
||||
func (o object) getKeys() []string {
|
||||
out := make([]string, 0)
|
||||
if !o.hasValidNode() {
|
||||
return out
|
||||
}
|
||||
|
||||
count := int(o.node.NamedChildCount())
|
||||
|
||||
for i := 0; i < count; i++ {
|
||||
pair := o.node.NamedChild(i)
|
||||
|
||||
if pair.Type() != "pair" {
|
||||
continue
|
||||
}
|
||||
|
||||
key := dequote(content(pair.ChildByFieldName("key"), o.source))
|
||||
out = append(out, key)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (o object) getObject(key string) object {
|
||||
return newObject(o.getNode(key), o.source)
|
||||
}
|
||||
|
||||
func (o object) getString(key, defaultVal string) string {
|
||||
value := o.getNode(key)
|
||||
if value == nil || value.Type() != "string" {
|
||||
return defaultVal
|
||||
}
|
||||
return dequote(content(value, o.source))
|
||||
}
|
||||
|
||||
func (o object) getStringI(key, defaultVal string) string {
|
||||
value := o.getNodeI(key)
|
||||
if value == nil || value.Type() != "string" {
|
||||
return defaultVal
|
||||
}
|
||||
return dequote(content(value, o.source))
|
||||
}
|
||||
16
set.go
Normal file
16
set.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package jsurls
|
||||
|
||||
// set is a collection of unique strings with constant-time
// membership tests.
type set map[string]any

// newSet builds a set containing every string in items.
func newSet(items []string) set {
	out := make(set, len(items))
	for _, v := range items {
		out[v] = struct{}{}
	}
	return out
}

// Contains reports whether item is a member of the set.
func (s set) Contains(item string) bool {
	_, ok := s[item]
	return ok
}
|
||||
215
strings.go
Normal file
215
strings.go
Normal file
@@ -0,0 +1,215 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// item is one chunk of a JavaScript string: either literal text, or
// a single escape sequence's payload tagged with its kind.
type item struct {
	typ itemType
	val string
}

// itemType identifies which kind of chunk an item holds.
type itemType int

const (
	itemString          itemType = iota // literal text
	itemSingleEscape                    // e.g. \n, \t, \\
	itemHexEscape                       // e.g. \x3d (two hex digits)
	itemOctalEscape                     // e.g. \075
	itemUnicodeEscape                   // e.g. \u003d (four hex digits)
	itemCodepointEscape                 // e.g. \u{3d}
)

// String renders the item as the text it represents, decoding any
// escape payload. Payloads that fail to parse are returned verbatim.
func (i item) String() string {
	switch i.typ {
	case itemString:
		return i.val

	case itemSingleEscape:
		switch i.val {
		case "b":
			return "\b"
		case "f":
			return "\f"
		case "n":
			return "\n"
		case "r":
			return "\r"
		case "t":
			return "\t"
		case "v":
			return "\v"
		}
		return i.val

	case itemHexEscape, itemUnicodeEscape, itemCodepointEscape:
		n, err := strconv.ParseInt(i.val, 16, 0)
		if err != nil {
			return i.val
		}
		return string(rune(n))

	case itemOctalEscape:
		n, err := strconv.ParseInt(i.val, 8, 0)
		if err != nil {
			return i.val
		}
		return string(rune(n))
	}

	return i.val
}

// stringLexer steps through str rune by rune, emitting items as it
// goes. The pending chunk is str[start:pos]; Emit appends it as an
// item and Ignore discards it.
type stringLexer struct {
	str   string // input being lexed
	start int    // start of the pending chunk
	pos   int    // current read position
	items []item // chunks emitted so far
	done  bool   // set once we've read past the end of str
}

// newStringLexer returns a lexer positioned at the start of in.
func newStringLexer(in string) *stringLexer {
	return &stringLexer{
		str:   in,
		items: make([]item, 0),
	}
}

// Next consumes and returns the next rune, or -1 once the input is
// exhausted (also setting done).
func (s *stringLexer) Next() rune {
	if s.pos >= len(s.str) {
		s.done = true
		return -1
	}

	r, size := utf8.DecodeRuneInString(s.str[s.pos:])
	s.pos += size
	return r
}

// Backup un-consumes the most recent rune. It is a no-op at the
// start of the input or once the input has been exhausted.
func (s *stringLexer) Backup() {
	if s.done || s.pos <= 0 {
		return
	}
	_, size := utf8.DecodeLastRuneInString(s.str[:s.pos])
	s.pos -= size
}

// Peek returns the next rune without consuming it.
func (s *stringLexer) Peek() rune {
	r := s.Next()
	s.Backup()
	return r
}

// Emit appends the pending chunk as an item of type t and starts a
// new chunk.
func (s *stringLexer) Emit(t itemType) {
	s.items = append(s.items, item{
		typ: t,
		val: s.str[s.start:s.pos],
	})
	s.start = s.pos
}

// Ignore discards the pending chunk.
func (s *stringLexer) Ignore() {
	s.start = s.pos
}

// Accept consumes the next rune if it appears in valid, reporting
// whether it did.
func (s *stringLexer) Accept(valid string) bool {
	if strings.ContainsRune(valid, s.Next()) {
		return true
	}
	s.Backup()
	return false
}

// AcceptN attempts to consume n runes from valid, reporting whether
// all n were accepted.
func (s *stringLexer) AcceptN(valid string, n int) bool {
	accepted := 0
	for i := 0; i < n; i++ {
		if s.Accept(valid) {
			accepted++
		}
	}
	return accepted == n
}

// AcceptUntil consumes runes up to (but not including) the first
// occurrence of r, or to the end of the input.
func (s *stringLexer) AcceptUntil(r rune) {
	for s.Next() != r && !s.done {
	}
	s.Backup()
}

// AcceptRun consumes a run of consecutive runes drawn from valid.
func (s *stringLexer) AcceptRun(valid string) {
	for strings.ContainsRune(valid, s.Next()) {
	}
	s.Backup()
}

// String concatenates the decoded form of every emitted item.
func (s *stringLexer) String() string {
	var b strings.Builder
	for _, it := range s.items {
		b.WriteString(it.String())
	}
	return b.String()
}
|
||||
|
||||
func DecodeString(in string) string {
|
||||
in = dequote(in)
|
||||
l := newStringLexer(in)
|
||||
|
||||
validHex := "0123456789abcdefABCDEF"
|
||||
|
||||
for !l.done {
|
||||
l.AcceptUntil('\\')
|
||||
l.Emit(itemString)
|
||||
|
||||
if l.done {
|
||||
break
|
||||
}
|
||||
|
||||
// Ignore the backslash
|
||||
l.Next()
|
||||
l.Ignore()
|
||||
|
||||
switch l.Next() {
|
||||
case 'b', 'f', 'n', 'r', 't', 'v', '\'', '"', '\\':
|
||||
l.Emit(itemSingleEscape)
|
||||
case '0':
|
||||
// It's a \0 (null)
|
||||
if !unicode.IsDigit(l.Peek()) {
|
||||
l.Emit(itemSingleEscape)
|
||||
continue
|
||||
}
|
||||
// It's an octal escape
|
||||
l.AcceptRun("01234567")
|
||||
l.Emit(itemOctalEscape)
|
||||
case 'x':
|
||||
// ignore the x
|
||||
l.Ignore()
|
||||
|
||||
// Exactly 2 hex digits
|
||||
if l.AcceptN(validHex, 2) {
|
||||
l.Emit(itemHexEscape)
|
||||
}
|
||||
case 'u':
|
||||
// ignore the u
|
||||
l.Ignore()
|
||||
|
||||
// e.g. \u{00003d}
|
||||
if l.Accept("{") {
|
||||
l.Ignore()
|
||||
l.AcceptRun(validHex)
|
||||
l.Emit(itemCodepointEscape)
|
||||
if l.Accept("}") {
|
||||
l.Ignore()
|
||||
}
|
||||
}
|
||||
|
||||
// e.g. \u003d
|
||||
if l.AcceptN(validHex, 4) {
|
||||
l.Emit(itemUnicodeEscape)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return l.String()
|
||||
}
|
||||
82
strings_test.go
Normal file
82
strings_test.go
Normal file
@@ -0,0 +1,82 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestStringDecode(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
expected string
|
||||
}{
|
||||
// middle
|
||||
{`"foo bar"`, `foo bar`},
|
||||
{`"foo\\bar"`, `foo\bar`},
|
||||
{`"foo\"bar"`, `foo"bar`},
|
||||
{`"foo\'bar"`, `foo'bar`},
|
||||
{`"foo\075bar"`, `foo=bar`},
|
||||
{`"foo\tbar"`, "foo\tbar"},
|
||||
{`"foo\vbar"`, "foo\vbar"},
|
||||
{`"foo\u003dbar"`, "foo=bar"},
|
||||
{`"foo\u{00000000003d}bar"`, "foo=bar"},
|
||||
|
||||
// end
|
||||
{`"foo\075"`, `foo=`},
|
||||
{`"foo\x3d"`, `foo=`},
|
||||
{`"foo\\"`, `foo\`},
|
||||
|
||||
// start
|
||||
{`"\075foo"`, `=foo`},
|
||||
{`"\x3dfoo"`, `=foo`},
|
||||
{`"\\foo"`, `\foo`},
|
||||
|
||||
// pairs
|
||||
{`"\075\x3d"`, `==`},
|
||||
{`"\u{00000003d}\x3d"`, `==`},
|
||||
|
||||
// Invalid
|
||||
{`"\poo"`, `poo`},
|
||||
{`"\u{0003doops"`, `=oops`},
|
||||
|
||||
// real-world
|
||||
{`"/help/doc/user_ed.jsp?loc\x3dhelp\x26target\x3d"`, "/help/doc/user_ed.jsp?loc=help&target="},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
actual := DecodeString(c.in)
|
||||
if c.expected != actual {
|
||||
t.Errorf("Want %s for DecodeString(%s); have %s", c.expected, c.in, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkDecodeString(b *testing.B) {
|
||||
inputs := []string{
|
||||
`"foo bar"`,
|
||||
`"foo\\bar"`,
|
||||
`"foo\"bar"`,
|
||||
`"foo\'bar"`,
|
||||
`"foo\075bar"`,
|
||||
`"foo\tbar"`,
|
||||
`"foo\vbar"`,
|
||||
`"foo\u003dbar"`,
|
||||
`"foo\u{00000000003d}bar"`,
|
||||
`"foo\075"`,
|
||||
`"foo\x3d"`,
|
||||
`"foo\\"`,
|
||||
`"\075foo"`,
|
||||
`"\x3dfoo"`,
|
||||
`"\\foo"`,
|
||||
`"\075\x3d"`,
|
||||
`"\u{00000003d}\x3d"`,
|
||||
`"\poo"`,
|
||||
`"\u{0003doops"`,
|
||||
`"/help/doc/user_ed.jsp?loc\x3dhelp\x26target\x3d"`,
|
||||
}
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, input := range inputs {
|
||||
_ = DecodeString(input)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
202
tree.go
Normal file
202
tree.go
Normal file
@@ -0,0 +1,202 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/javascript"
|
||||
)
|
||||
|
||||
// nil-safe wrapper around calling node.Content(source)
|
||||
func content(n *sitter.Node, source []byte) string {
|
||||
if n == nil {
|
||||
return ""
|
||||
}
|
||||
return n.Content(source)
|
||||
}
|
||||
|
||||
func isStringy(n *sitter.Node, source []byte) bool {
|
||||
if n.Type() == "string" {
|
||||
return true
|
||||
}
|
||||
|
||||
c := content(n, source)
|
||||
if len(c) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
switch c[0:0] {
|
||||
case `"`, "'", "`":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func hasDescendantOfType(n *sitter.Node, t string) bool {
|
||||
if n == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
// node is provided type exactly
|
||||
if n.Type() == t {
|
||||
return true
|
||||
}
|
||||
|
||||
hasType := false
|
||||
enter := func(n *sitter.Node) {
|
||||
if n.Type() == t {
|
||||
hasType = true
|
||||
}
|
||||
}
|
||||
|
||||
walk(n, enter)
|
||||
return hasType
|
||||
}
|
||||
|
||||
// cleanURL takes a node representing a URL and attempts to make it
|
||||
// at least somewhat easily parseable. It's common to build URLs out
|
||||
// of variables and function calls so we want to turn something like:
|
||||
//
|
||||
// './upload.php?profile='+res.id+'&show='+$('.participate_modal_container').attr('data-val')
|
||||
//
|
||||
// Into something more like:
|
||||
//
|
||||
// ./upload.php?profile=EXPR&show=EXPR
|
||||
//
|
||||
func cleanURL(n *sitter.Node, source []byte) string {
|
||||
if n == nil {
|
||||
return ""
|
||||
}
|
||||
switch n.Type() {
|
||||
case "binary_expression":
|
||||
return fmt.Sprintf(
|
||||
"%s%s",
|
||||
cleanURL(n.ChildByFieldName("left"), source),
|
||||
cleanURL(n.ChildByFieldName("right"), source),
|
||||
)
|
||||
case "string":
|
||||
return dequote(content(n, source))
|
||||
default:
|
||||
return "EXPR"
|
||||
}
|
||||
}
|
||||
|
||||
// dequote strips any leading and trailing quote characters — single
// quotes, double quotes, or backticks — from in.
func dequote(in string) string {
	const quotes = "'\"`"
	return strings.TrimRight(strings.TrimLeft(in, quotes), quotes)
}
|
||||
|
||||
func query(n *sitter.Node, query string, enter func(*sitter.Node)) {
|
||||
q, err := sitter.NewQuery(
|
||||
[]byte(query),
|
||||
javascript.GetLanguage(),
|
||||
)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
qc := sitter.NewQueryCursor()
|
||||
defer qc.Close()
|
||||
|
||||
qc.Exec(q, n)
|
||||
|
||||
for {
|
||||
match, exists := qc.NextMatch()
|
||||
if !exists || match == nil {
|
||||
break
|
||||
}
|
||||
|
||||
for _, capture := range match.Captures {
|
||||
enter(capture.Node)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// walk performs an iterative depth-first traversal of the subtree
// rooted at n, calling enter for every node the cursor moves onto.
//
// NOTE: enter fires only after a cursor movement (first-child or
// next-sibling), so it is never called for n itself — callers such as
// hasDescendantOfType check the root separately.
func walk(n *sitter.Node, enter func(*sitter.Node)) {

	c := sitter.NewTreeCursor(n)
	defer c.Close()

	// walkies
	// recurse tracks whether we should try to descend from the current
	// node; it is cleared after climbing up so we don't revisit children.
	recurse := true
	for {
		// descend into the tree
		if recurse && c.GoToFirstChild() {
			recurse = true
			enter(c.CurrentNode())
			continue
		}

		// move sideways
		if c.GoToNextSibling() {
			recurse = true
			enter(c.CurrentNode())
			continue
		}

		// climb back up the tree, but make sure we don't descend right back to where we were
		if c.GoToParent() {
			recurse = false
			continue
		}
		// no child, sibling, or parent left: traversal is complete
		break
	}

}
|
||||
|
||||
func PrintTree(source []byte) {
|
||||
parser := sitter.NewParser()
|
||||
parser.SetLanguage(javascript.GetLanguage())
|
||||
|
||||
tree := parser.Parse(nil, source)
|
||||
root := tree.RootNode()
|
||||
|
||||
PrettyPrint(root, source)
|
||||
}
|
||||
|
||||
// PrettyPrint writes an indented representation of the syntax tree
// rooted at n to stdout. Only named nodes are printed; leaf nodes and
// "string" nodes additionally show their source content in
// parentheses, and field names (e.g. "left: ") are shown when present.
func PrettyPrint(n *sitter.Node, source []byte) {

	c := sitter.NewTreeCursor(n)
	defer c.Close()

	// walkies
	depth := 0
	recurse := true
	for {
		// Print a node only the first time we land on it (when recurse
		// is set), so nothing is re-printed while climbing back up.
		if recurse && c.CurrentNode().IsNamed() {
			fieldName := c.CurrentFieldName()
			if fieldName != "" {
				fieldName += ": "
			}

			// show source content for leaves and whole string nodes
			contentStr := ""
			if c.CurrentNode().ChildCount() == 0 || c.CurrentNode().Type() == "string" {
				contentStr = fmt.Sprintf(" (%s)", content(c.CurrentNode(), source))
			}
			fmt.Printf("%s%s%s%s\n", strings.Repeat(" ", depth), fieldName, c.CurrentNode().Type(), contentStr)
		}

		// descend into the tree
		if recurse && c.GoToFirstChild() {
			recurse = true
			depth++
			continue
		}

		// move sideways
		if c.GoToNextSibling() {
			recurse = true
			continue
		}

		// climb back up the tree, but make sure we don't descend right back to where we were
		if c.GoToParent() {
			depth--
			recurse = false
			continue
		}
		break
	}

}
|
||||
45
tree_test.go
Normal file
45
tree_test.go
Normal file
@@ -0,0 +1,45 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"github.com/smacker/go-tree-sitter/javascript"
|
||||
)
|
||||
|
||||
// TestCleanURL parses small JavaScript expressions and checks that
// cleanURL flattens string concatenations, dequotes literals, and
// replaces non-string expressions with the EXPR placeholder.
func TestCleanURL(t *testing.T) {
	cases := []struct {
		JS       []byte
		Expected string
	}{
		{[]byte(`"./login.php?redirect="+url`), "./login.php?redirect=EXPR"},
		{[]byte(`'/path/'+['one', 'two', 'three'].join('/')`), "/path/EXPR"},
		{[]byte(`someVar`), "EXPR"},
	}

	parser := sitter.NewParser()
	parser.SetLanguage(javascript.GetLanguage())

	for i, c := range cases {
		t.Run(strconv.Itoa(i), func(t *testing.T) {
			tree := parser.Parse(nil, c.JS)
			root := tree.RootNode()

			// Example tree:
			// program
			//   expression_statement
			//     binary_expression
			//       left: string ("./login.php?redirect=")
			//       right: identifier (url)
			//
			// We want the binary_expression to pass to cleanURL, which is
			// the first Named Child of the first Named Child of the root node.
			actual := cleanURL(root.NamedChild(0).NamedChild(0), c.JS)

			if actual != c.Expected {
				t.Errorf("want %s for cleanURL(%s), have: %s", c.Expected, c.JS, actual)
			}
		})
	}
}
|
||||
119
url-match-jquery.go
Normal file
119
url-match-jquery.go
Normal file
@@ -0,0 +1,119 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
func matchJQuery() urlMatcher {
|
||||
|
||||
return urlMatcher{"call_expression", func(n *sitter.Node, source []byte) *URL {
|
||||
callName := content(n.ChildByFieldName("function"), source)
|
||||
|
||||
if !slices.Contains(
|
||||
[]string{
|
||||
"$.get", "$.post", "$.ajax",
|
||||
"jQuery.get", "jQuery.post", "jQuery.ajax",
|
||||
},
|
||||
callName,
|
||||
) {
|
||||
return nil
|
||||
}
|
||||
|
||||
// The jQuery ajax calls have a few different call signatures
|
||||
// that we need to account for:
|
||||
// jQuery.post( url [, data ] [, success ] [, dataType ] )
|
||||
// jQuery.get( url [, data ] [, success ] [, dataType ] )
|
||||
// jQuery.ajax( url [, settings ] )
|
||||
// jQuery.post( [settings] )
|
||||
// jQuery.get( [settings] )
|
||||
// jQuery.ajax( [settings] )
|
||||
//
|
||||
// So we end up with three scenarios to deal with:
|
||||
// 1. The URL comes first, then a data object
|
||||
// 2. The URL comes first, then a settings object
|
||||
// 3. A settings object comes first.
|
||||
arguments := n.ChildByFieldName("arguments")
|
||||
if arguments == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
firstArg := arguments.NamedChild(0)
|
||||
if firstArg == nil {
|
||||
return nil
|
||||
}
|
||||
secondArg := arguments.NamedChild(1)
|
||||
|
||||
m := &URL{
|
||||
Type: callName,
|
||||
Source: content(n, source),
|
||||
}
|
||||
|
||||
// Infer the method for .post and .get calls
|
||||
if strings.HasSuffix(callName, ".post") {
|
||||
m.Method = "POST"
|
||||
} else if strings.HasSuffix(callName, ".get") {
|
||||
m.Method = "GET"
|
||||
}
|
||||
|
||||
var settingsNode *sitter.Node
|
||||
|
||||
if isStringy(firstArg, source) {
|
||||
// first argument is the URL
|
||||
m.URL = cleanURL(firstArg, source)
|
||||
|
||||
// If the first arg is a URL, the second arg is a
|
||||
// settings object for $.ajax, or a data object for
|
||||
// $.get and $.post
|
||||
if strings.HasSuffix(callName, ".ajax") {
|
||||
settingsNode = secondArg
|
||||
} else {
|
||||
params := newObject(secondArg, source).getKeys()
|
||||
if m.Method == "GET" {
|
||||
m.QueryParams = params
|
||||
} else {
|
||||
m.BodyParams = params
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if firstArg.Type() == "object" {
|
||||
// first argument is a settings object
|
||||
settingsNode = firstArg
|
||||
}
|
||||
|
||||
if settingsNode == nil {
|
||||
// we didn't end up with a settings node,
|
||||
// so we can't infer anything else
|
||||
return m
|
||||
}
|
||||
|
||||
settings := newObject(settingsNode, source)
|
||||
|
||||
if m.URL == "" {
|
||||
m.URL = cleanURL(settings.getNode("url"), source)
|
||||
}
|
||||
|
||||
m.Headers = settings.getObject("headers").asMap()
|
||||
|
||||
if m.Method == "" {
|
||||
// method can be specified as either `method`, or
|
||||
// `type`, and defaults to GET
|
||||
m.Method = settings.getString(
|
||||
"method",
|
||||
settings.getString("type", "GET"),
|
||||
)
|
||||
}
|
||||
|
||||
params := settings.getObject("data").getKeys()
|
||||
if m.Method == "GET" {
|
||||
m.QueryParams = params
|
||||
} else {
|
||||
m.BodyParams = params
|
||||
}
|
||||
|
||||
return m
|
||||
}}
|
||||
}
|
||||
167
url-match-xhr.go
Normal file
167
url-match-xhr.go
Normal file
@@ -0,0 +1,167 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
"golang.org/x/exp/slices"
|
||||
)
|
||||
|
||||
type nodeCache struct {
|
||||
sync.RWMutex
|
||||
data map[*sitter.Node][]*sitter.Node
|
||||
}
|
||||
|
||||
func newNodeCache() *nodeCache {
|
||||
return &nodeCache{
|
||||
data: make(map[*sitter.Node][]*sitter.Node),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *nodeCache) set(k *sitter.Node, v []*sitter.Node) {
|
||||
c.Lock()
|
||||
c.data[k] = v
|
||||
c.Unlock()
|
||||
}
|
||||
|
||||
func (c *nodeCache) get(k *sitter.Node) ([]*sitter.Node, bool) {
|
||||
c.RLock()
|
||||
v, exists := c.data[k]
|
||||
c.RUnlock()
|
||||
return v, exists
|
||||
}
|
||||
|
||||
func matchXHR() urlMatcher {
|
||||
cache := newNodeCache()
|
||||
|
||||
return urlMatcher{"call_expression", func(n *sitter.Node, source []byte) *URL {
|
||||
callName := content(n.ChildByFieldName("function"), source)
|
||||
|
||||
// We don't know what the XMLHttpRequest object will be called,
|
||||
// so we have to focus on just the .open bit
|
||||
if !strings.HasSuffix(callName, ".open") {
|
||||
return nil
|
||||
}
|
||||
|
||||
// There's a bunch of different stuff we might have matched,
|
||||
// including window.open, so we're going to try and guess
|
||||
// based on the first argument being a valid HTTP method.
|
||||
// This will miss cases where the method is a variable.
|
||||
arguments := n.ChildByFieldName("arguments")
|
||||
|
||||
method := dequote(content(arguments.NamedChild(0), source))
|
||||
|
||||
if !slices.Contains(
|
||||
[]string{"GET", "HEAD", "OPTIONS", "POST", "PUT", "PATCH", "DELETE"},
|
||||
method,
|
||||
) {
|
||||
return nil
|
||||
}
|
||||
|
||||
urlArg := arguments.NamedChild(1)
|
||||
if !isStringy(urlArg, source) {
|
||||
return nil
|
||||
}
|
||||
|
||||
match := &URL{
|
||||
URL: cleanURL(urlArg, source),
|
||||
Method: method,
|
||||
Type: "XMLHttpRequest.open",
|
||||
Source: content(n, source),
|
||||
}
|
||||
|
||||
// to find headers we need to look for calls to setRequestHeader() on
|
||||
// the same object as the .open call. We'll stick to the same scope
|
||||
// (i.e. sibling expressions) because we have no way to know if we're
|
||||
// dealing with the same object or not otherwise.
|
||||
objectName := strings.TrimSuffix(callName, ".open")
|
||||
|
||||
// We want to find the parent/ancestor node that defines the scope in which
|
||||
// we are calling XHR.open(). JavaScript has three types of scope: global,
|
||||
// function, and block. Block scope only comes into play if values
|
||||
// are defined using 'let', or 'const'. We don't know if the XHR object
|
||||
// was defined with let or const, so we're just going to ignore block scope.
|
||||
// That leaves us with global scope and function scope. To find those we
|
||||
// can ascend the tree until we hit a node with type "function_declaration",
|
||||
// or we hit a nil parent.
|
||||
parent := n.Parent()
|
||||
if parent == nil {
|
||||
return match
|
||||
}
|
||||
for {
|
||||
candidate := parent.Parent()
|
||||
if candidate == nil {
|
||||
break
|
||||
}
|
||||
parent = candidate
|
||||
if parent.Type() == "function_declaration" {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Look for call_expressions under the same parent as our .open call.
|
||||
// It's common to end up querying the exact same parent over and over
|
||||
// again, so we cache the results on a per-parent node basis.
|
||||
nodes := make([]*sitter.Node, 0)
|
||||
if v, exists := cache.get(parent); exists {
|
||||
nodes = v
|
||||
} else {
|
||||
q := `
|
||||
(call_expression
|
||||
function: (member_expression
|
||||
object: (identifier)
|
||||
property: (property_identifier)
|
||||
)
|
||||
arguments: (arguments (string))
|
||||
) @matches
|
||||
`
|
||||
query(parent, q, func(sibling *sitter.Node) {
|
||||
nodes = append(nodes, sibling)
|
||||
})
|
||||
cache.set(parent, nodes)
|
||||
}
|
||||
|
||||
headers := make(map[string]string, 0)
|
||||
// TODO: I think we can get more accuracy here by relying on the fact that
|
||||
// the .setRequestHeader calls we're interested in must come *after* the .open
|
||||
// call in order to be valid. In theory that means we can skip any nodes at
|
||||
// all that come before the .open call we're currently looking at. We could
|
||||
// also stop looking after we see a .send call on the same object, although
|
||||
// it's possible for the .send to be wrapped in a conditional so that might
|
||||
// cause us to miss some values.
|
||||
for _, sibling := range nodes {
|
||||
name := content(sibling.ChildByFieldName("function"), source)
|
||||
if !strings.HasSuffix(name, ".setRequestHeader") {
|
||||
continue
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(name, objectName) {
|
||||
continue
|
||||
}
|
||||
|
||||
args := sibling.ChildByFieldName("arguments")
|
||||
headerNode := args.NamedChild(0)
|
||||
if headerNode == nil || headerNode.Type() != "string" {
|
||||
continue
|
||||
}
|
||||
|
||||
header := dequote(content(headerNode, source))
|
||||
if _, exists := headers[header]; exists {
|
||||
continue
|
||||
}
|
||||
|
||||
var value string
|
||||
valueNode := args.NamedChild(1)
|
||||
if valueNode != nil && valueNode.Type() == "string" {
|
||||
value = dequote(content(valueNode, source))
|
||||
}
|
||||
|
||||
headers[header] = value
|
||||
}
|
||||
|
||||
match.Headers = headers
|
||||
|
||||
return match
|
||||
}}
|
||||
}
|
||||
284
url-matchers.go
Normal file
284
url-matchers.go
Normal file
@@ -0,0 +1,284 @@
|
||||
package jsurls
|
||||
|
||||
import (
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
sitter "github.com/smacker/go-tree-sitter"
|
||||
)
|
||||
|
||||
// a URL is any URL found in the source code with accompanying details
type URL struct {
	// the (possibly cleaned/placeholder-containing) URL that was found
	URL string `json:"url"`
	// names of query-string parameters seen in the URL or call
	QueryParams []string `json:"queryParams"`
	// names of request-body parameters seen in the call
	BodyParams []string `json:"bodyParams"`
	// HTTP method, where it could be determined or inferred
	Method string `json:"method"`
	// request headers, where they could be extracted
	Headers map[string]string `json:"headers,omitempty"`
	// content-type header value, where one was found
	ContentType string `json:"contentType,omitempty"`

	// some description like locationAssignment, fetch, $.post or something like that
	Type string `json:"type"`

	// full source/content of the node; is optional
	Source string `json:"source,omitempty"`

	// the filename in which the match was found
	Filename string `json:"filename,omitempty"`
}
|
||||
|
||||
// GetURLs searches the JavaScript source code for absolute and relative URLs and returns
|
||||
// a slice of results.
|
||||
func (a *Analyzer) GetURLs() []*URL {
|
||||
|
||||
matches := make([]*URL, 0)
|
||||
|
||||
re := regexp.MustCompile("[^A-Z-a-z]")
|
||||
|
||||
// function to run on entry to each node in the tree
|
||||
enter := func(n *sitter.Node) {
|
||||
|
||||
for _, matcher := range a.urlMatchers {
|
||||
if matcher.Type != n.Type() {
|
||||
continue
|
||||
}
|
||||
|
||||
match := matcher.Fn(n, a.source)
|
||||
if match == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// decode any escapes in the URL
|
||||
match.URL = DecodeString(match.URL)
|
||||
|
||||
// an empty slice is easier to deal with than null, e.g when using jq
|
||||
if match.QueryParams == nil {
|
||||
match.QueryParams = []string{}
|
||||
}
|
||||
if match.BodyParams == nil {
|
||||
match.BodyParams = []string{}
|
||||
}
|
||||
|
||||
// Filter out data: and tel: schemes
|
||||
lower := strings.ToLower(match.URL)
|
||||
if strings.HasPrefix(lower, "data:") || strings.HasPrefix(lower, "tel:") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Look for URLs that are entirely made up of EXPR replacements
|
||||
// and skip them. Maybe this should be optional? Maybe it should
|
||||
// remove things like EXPR#EXPR etc too
|
||||
letters := re.ReplaceAllString(match.URL, "")
|
||||
if strings.ReplaceAll(letters, "EXPR", "") == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse any query params out of the URL and add them. Some, but not
|
||||
// all of the matchers will add query params, so we want to do it here
|
||||
// and then remove duplicates
|
||||
u, err := url.Parse(match.URL)
|
||||
if err == nil {
|
||||
// manually disallow www.w3.org just because it shows up so damn often
|
||||
if u.Hostname() == "www.w3.org" {
|
||||
continue
|
||||
}
|
||||
|
||||
for p, _ := range u.Query() {
|
||||
// Ignore params that were expressions
|
||||
if p == "EXPR" {
|
||||
continue
|
||||
}
|
||||
match.QueryParams = append(match.QueryParams, p)
|
||||
}
|
||||
}
|
||||
match.QueryParams = unique(match.QueryParams)
|
||||
|
||||
matches = append(matches, match)
|
||||
}
|
||||
}
|
||||
|
||||
// find the nodes we need in the the tree and run the enter function for every node
|
||||
query(a.rootNode, "[(assignment_expression) (call_expression) (string)] @matches", enter)
|
||||
|
||||
return matches
|
||||
}
|
||||
|
||||
// unique returns the distinct values from items. The order of the
// returned slice is undefined because it follows map iteration order.
func unique[T comparable](items []T) []T {
	// use map[T]struct{} as a set: zero-width values, no per-entry
	// storage (the original used map[T]any, which boxes each value)
	set := make(map[T]struct{}, len(items))
	for _, item := range items {
		set[item] = struct{}{}
	}

	out := make([]T, 0, len(set))
	for item := range set {
		out = append(out, item)
	}
	return out
}
|
||||
|
||||
// a URLMatcher has a type of thing it matches against (e.g. assignment_expression),
// and a function to actually do the matching and producing of the *URL
type URLMatcher struct {
	// Type is the tree-sitter node type this matcher applies to
	Type string
	// Fn inspects a node of that type (plus the file source) and
	// returns a *URL, or nil when the node isn't a match
	Fn func(*sitter.Node, []byte) *URL
}
|
||||
|
||||
func AllURLMatchers() []URLMatcher {
|
||||
|
||||
assignmentNames := newSet([]string{
|
||||
"location",
|
||||
"this.url",
|
||||
"this._url",
|
||||
"this.baseUrl",
|
||||
})
|
||||
|
||||
isInterestingAssignment := func(name string) bool {
|
||||
if assignmentNames.Contains(name) {
|
||||
return true
|
||||
}
|
||||
|
||||
if strings.HasSuffix(name, ".href") {
|
||||
return true
|
||||
}
|
||||
|
||||
if strings.HasSuffix(name, ".location") {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
matchers := []URLMatcher{
|
||||
// XMLHttpRequest.open(method, url)
|
||||
matchXHR(),
|
||||
|
||||
// $.post, $.get, and $.ajax
|
||||
matchJQuery(),
|
||||
|
||||
// location assignment
|
||||
{"assignment_expression", func(n *sitter.Node, source []byte) *URL {
|
||||
left := n.ChildByFieldName("left")
|
||||
right := n.ChildByFieldName("right")
|
||||
|
||||
if !isInterestingAssignment(content(left, source)) {
|
||||
return nil
|
||||
}
|
||||
|
||||
// We want to find values that at least *start* with a string of some kind.
|
||||
// This might be kind of useful to the crawler:
|
||||
//
|
||||
// location.href = "/somePath/" + someVar;
|
||||
//
|
||||
// Where as this tends to end up being kind of useless:
|
||||
//
|
||||
// location.href = someVar + "/somePath/";
|
||||
//
|
||||
// So while we might miss out on some things this way, they probably wouldn't
|
||||
// have been super useful to anything automated anyway.
|
||||
rightContent := content(right, source)
|
||||
if len(rightContent) < 2 {
|
||||
return nil
|
||||
}
|
||||
p := rightContent[0:1]
|
||||
if p != `"` && p != "'" && p != "`" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &URL{
|
||||
URL: cleanURL(right, source),
|
||||
Method: "GET",
|
||||
Type: "locationAssignment",
|
||||
Source: content(n, source),
|
||||
}
|
||||
}},
|
||||
|
||||
// location replacement
|
||||
{"call_expression", func(n *sitter.Node, source []byte) *URL {
|
||||
callName := content(n.ChildByFieldName("function"), source)
|
||||
|
||||
if !strings.HasSuffix(callName, "location.replace") {
|
||||
return nil
|
||||
}
|
||||
|
||||
arguments := n.ChildByFieldName("arguments")
|
||||
|
||||
// check the argument contains at least one string literal
|
||||
if !hasDescendantOfType(arguments.NamedChild(0), "string") {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &URL{
|
||||
URL: cleanURL(arguments.NamedChild(0), source),
|
||||
Method: "GET",
|
||||
Type: "locationReplacement",
|
||||
Source: content(n, source),
|
||||
}
|
||||
}},
|
||||
|
||||
// window.open(url)
|
||||
{"call_expression", func(n *sitter.Node, source []byte) *URL {
|
||||
callName := content(n.ChildByFieldName("function"), source)
|
||||
if callName != "window.open" && callName != "open" {
|
||||
return nil
|
||||
}
|
||||
arguments := n.ChildByFieldName("arguments")
|
||||
|
||||
// check the argument contains at least one string literal
|
||||
if !hasDescendantOfType(arguments.NamedChild(0), "string") {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &URL{
|
||||
URL: cleanURL(arguments.NamedChild(0), source),
|
||||
Method: "GET",
|
||||
Type: "window.open",
|
||||
Source: content(n, source),
|
||||
}
|
||||
return nil
|
||||
}},
|
||||
|
||||
// fetch(url, [init])
|
||||
{"call_expression", func(n *sitter.Node, source []byte) *URL {
|
||||
callName := content(n.ChildByFieldName("function"), source)
|
||||
if callName != "fetch" {
|
||||
return nil
|
||||
}
|
||||
arguments := n.ChildByFieldName("arguments")
|
||||
|
||||
// check the argument contains at least one string literal
|
||||
if !hasDescendantOfType(arguments.NamedChild(0), "string") {
|
||||
return nil
|
||||
}
|
||||
|
||||
init := newObject(arguments.NamedChild(1), source)
|
||||
|
||||
return &URL{
|
||||
URL: cleanURL(arguments.NamedChild(0), source),
|
||||
Method: init.getString("method", "GET"),
|
||||
Headers: init.getObject("headers").asMap(),
|
||||
ContentType: init.getObject("headers").getStringI("content-type", ""),
|
||||
Type: "fetch",
|
||||
Source: content(n, source),
|
||||
}
|
||||
return nil
|
||||
}},
|
||||
|
||||
// string literals
|
||||
{"string", func(n *sitter.Node, source []byte) *URL {
|
||||
trimmed := dequote(content(n, source))
|
||||
|
||||
if !MaybeURL(trimmed) {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &URL{
|
||||
URL: trimmed,
|
||||
Type: "stringLiteral",
|
||||
Source: content(n, source),
|
||||
}
|
||||
}},
|
||||
}
|
||||
|
||||
return matchers
|
||||
}
|
||||
Reference in New Issue
Block a user