Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified docker/sswlinkauditor
Binary file not shown.
33 changes: 25 additions & 8 deletions docker/sswlinkauditor.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,24 @@ func isSameOrigin(url1 string, url2 string) bool {
}

// isSameOriginAndPath reports whether targetUrl is baseUrl itself or sits
// underneath it.
//
// The comparison is purely textual: both arguments have their trailing
// slashes stripped, then targetUrl must begin with baseUrl and whatever
// follows the shared prefix must be empty or start with "/", "?" or "#".
// Requiring that boundary keeps "/api" from matching "/api-v2" and keeps
// "https://example.com" from matching "https://example.com.evil.com".
func isSameOriginAndPath(baseUrl string, targetUrl string) bool {
	// Ignore trailing slashes so "/blog" and "/blog/" compare equal.
	normalizedBase := strings.TrimRight(baseUrl, "/")
	normalizedTarget := strings.TrimRight(targetUrl, "/")

	if !strings.HasPrefix(normalizedTarget, normalizedBase) {
		return false
	}

	// Whatever is left after the shared prefix decides the outcome.
	remainder := normalizedTarget[len(normalizedBase):]
	if remainder == "" {
		// Exact match once trailing slashes are ignored.
		return true
	}

	// The prefix must end on a URL boundary, not in the middle of a
	// path segment or host name.
	switch remainder[0] {
	case '/', '?', '#':
		return true
	}
	return false
}

func crawl(link Link, ch chan Link, linkch chan LinkStatus, number int) {
Expand Down Expand Up @@ -319,13 +332,17 @@ func parseUrl(startUrl string, url string) string {
if len(filenameRegex.FindStringSubmatch(UrlPath)) > 0 {
fileName := filenameRegex.FindStringSubmatch(UrlPath)[0]
UrlPath = strings.ReplaceAll(UrlPath, fileName, "")
} else if !strings.HasSuffix(UrlPath, "/") && UrlPath != "" {
// If the path doesn't end with / and isn't empty, it's a document not a directory
// Get the parent directory for relative link resolution
lastSlash := strings.LastIndex(UrlPath, "/")
if lastSlash >= 0 {
UrlPath = UrlPath[:lastSlash+1]
}
}

baseUrl := sUrl.Scheme + "://" + sUrl.Hostname() + UrlPath
if !strings.HasSuffix(baseUrl, "/") {
baseUrl = baseUrl + "/"
}


u, _ := urlP.Parse(baseUrl)
u.Path = path.Join(u.Path, url)
return u.String()
Expand Down
242 changes: 242 additions & 0 deletions docker/sswlinkauditor_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
package main

import (
urlP "net/url"
"testing"
)

// TestParseUrl feeds a table of (start URL, link) pairs through parseUrl and
// compares each returned string with the expected resolved URL. Every row
// runs as its own subtest, named after the row.
func TestParseUrl(t *testing.T) {
	type parseCase struct {
		name string
		base string // page the link was found on
		link string // link text as passed to parseUrl
		want string // expected return value
	}

	cases := []parseCase{
		{
			name: "Absolute URL - no change",
			base: "https://example.com/page",
			link: "https://other.com/path",
			want: "https://other.com/path",
		},
		{
			name: "Protocol-relative URL",
			base: "https://example.com/page",
			link: "//cdn.example.com/resource.js",
			want: "https://cdn.example.com/resource.js",
		},
		{
			name: "Root-relative URL",
			base: "https://example.com/some/page",
			link: "/about",
			want: "https://example.com/about",
		},
		{
			name: "Relative URL from page with trailing slash",
			base: "https://example.com/blog/",
			link: "post",
			want: "https://example.com/blog/post",
		},
		{
			// "blog" is treated as a page, so the link resolves against its parent.
			name: "Relative URL from page without trailing slash",
			base: "https://example.com/blog",
			link: "post",
			want: "https://example.com/post",
		},
		{
			name: "Relative URL from directory with trailing slash",
			base: "https://example.com/blog/",
			link: "post",
			want: "https://example.com/blog/post",
		},
		{
			name: "Relative URL from page with extension",
			base: "https://example.com/blog/index.html",
			link: "about.html",
			want: "https://example.com/blog/about.html",
		},
		{
			name: "URL with fragment - fragment removed",
			base: "https://example.com/page",
			link: "https://example.com/other#section",
			want: "https://example.com/other",
		},
		{
			name: "Deep relative URL",
			base: "https://example.com/a/b/c/page.html",
			link: "d/e/file.html",
			want: "https://example.com/a/b/c/d/e/file.html",
		},
		{
			name: "Relative URL with .aspx extension in startUrl",
			base: "https://example.com/products/list.aspx",
			link: "detail",
			want: "https://example.com/products/detail",
		},
		{
			name: "Root path should not add trailing slash",
			base: "https://example.com",
			link: "page",
			want: "https://example.com/page",
		},
		{
			name: "SSW Rules page - relative link without extension",
			base: "https://www.ssw.com.au/rules/best-way-to-display-code-on-your-website",
			link: "set-language-on-code-blocks",
			want: "https://www.ssw.com.au/rules/set-language-on-code-blocks",
		},
		{
			name: "Root level page - relative link",
			base: "https://example.com/page",
			link: "other",
			want: "https://example.com/other",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := parseUrl(tc.base, tc.link); got != tc.want {
				t.Errorf("parseUrl(%q, %q) = %q; want %q", tc.base, tc.link, got, tc.want)
			}
		})
	}
}

// TestParseUrl_NoTrailingSlashAdded checks that a start URL whose last path
// segment has no trailing slash is not treated as a directory: the relative
// link must replace that last segment instead of being appended beneath it.
func TestParseUrl_NoTrailingSlashAdded(t *testing.T) {
	tests := []struct {
		name     string
		startUrl string
		url      string
		wantPath string // exact path expected in the URL returned by parseUrl
	}{
		{
			name:     "Should not add slash to base without directory",
			startUrl: "https://example.com/page",
			url:      "other",
			wantPath: "/other", // "page" is replaced, not used as a directory
		},
		{
			name:     "Should preserve path structure",
			startUrl: "https://example.com/api/v1",
			url:      "users",
			wantPath: "/api/users", // must not become /api/v1/users
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := parseUrl(tt.startUrl, tt.url)
			// containsPath compares the parsed path for equality, so the
			// failure message reports an exact expected path.
			if !containsPath(result, tt.wantPath) {
				t.Errorf("parseUrl(%q, %q) = %q; want path %q", tt.startUrl, tt.url, result, tt.wantPath)
			}
		})
	}
}

// TestIsSameOriginAndPath runs isSameOriginAndPath over a table of base and
// target URLs. It covers exact matches, trailing-slash differences, child
// paths, query and fragment continuations, and the boundary cases where the
// target only shares a textual prefix with the base.
func TestIsSameOriginAndPath(t *testing.T) {
	tests := []struct {
		name      string
		baseUrl   string
		targetUrl string
		expected  bool
	}{
		{
			name:      "Exact match",
			baseUrl:   "https://example.com/blog",
			targetUrl: "https://example.com/blog",
			expected:  true,
		},
		{
			name:      "Exact match with trailing slash on both",
			baseUrl:   "https://example.com/blog/",
			targetUrl: "https://example.com/blog/",
			expected:  true,
		},
		{
			name:      "Match with one trailing slash",
			baseUrl:   "https://example.com/blog",
			targetUrl: "https://example.com/blog/",
			expected:  true,
		},
		{
			name:      "Child path with separator",
			baseUrl:   "https://example.com/blog",
			targetUrl: "https://example.com/blog/post",
			expected:  true,
		},
		{
			name:      "Deep child path",
			baseUrl:   "https://example.com/blog",
			targetUrl: "https://example.com/blog/2024/01/post",
			expected:  true,
		},
		{
			name:      "Same prefix but different path - should NOT match",
			baseUrl:   "https://example.com/api",
			targetUrl: "https://example.com/api-v2",
			expected:  false,
		},
		{
			name:      "Same prefix without separator - should NOT match",
			baseUrl:   "https://example.com/blog",
			targetUrl: "https://example.com/blogpost",
			expected:  false,
		},
		{
			name:      "Different domain - should NOT match",
			baseUrl:   "https://example.com/blog",
			targetUrl: "https://other.com/blog",
			expected:  false,
		},
		{
			// The base text is a prefix of the target host, but the next
			// character is "." rather than a path/query/fragment boundary.
			name:      "Lookalike host sharing the base as prefix - should NOT match",
			baseUrl:   "https://example.com",
			targetUrl: "https://example.com.evil.com/page",
			expected:  false,
		},
		{
			name:      "Different scheme - should NOT match",
			baseUrl:   "https://example.com/blog",
			targetUrl: "http://example.com/blog",
			expected:  false,
		},
		{
			name:      "Base is longer - should NOT match",
			baseUrl:   "https://example.com/blog/post",
			targetUrl: "https://example.com/blog",
			expected:  false,
		},
		{
			name:      "Query string continuation",
			baseUrl:   "https://example.com/search",
			targetUrl: "https://example.com/search?q=test",
			expected:  true,
		},
		{
			name:      "Fragment continuation",
			baseUrl:   "https://example.com/page",
			targetUrl: "https://example.com/page#section",
			expected:  true,
		},
		{
			name:      "Root path",
			baseUrl:   "https://example.com/",
			targetUrl: "https://example.com/anything",
			expected:  true,
		},
		{
			name:      "Empty path base",
			baseUrl:   "https://example.com",
			targetUrl: "https://example.com/page",
			expected:  true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isSameOriginAndPath(tt.baseUrl, tt.targetUrl)
			if result != tt.expected {
				t.Errorf("isSameOriginAndPath(%q, %q) = %v; want %v", tt.baseUrl, tt.targetUrl, result, tt.expected)
			}
		})
	}
}

// containsPath reports whether url parses successfully and its path component
// is exactly wantPath. Despite the name this is an equality check, not a
// substring search; a URL that fails to parse yields false.
func containsPath(url, wantPath string) bool {
	if parsed, err := urlP.Parse(url); err == nil {
		return parsed.Path == wantPath
	}
	return false
}