diff --git a/docker/sswlinkauditor b/docker/sswlinkauditor
index 2512bf62..62df5815 100755
Binary files a/docker/sswlinkauditor and b/docker/sswlinkauditor differ
diff --git a/docker/sswlinkauditor.go b/docker/sswlinkauditor.go
index 5840da6c..ebbeff4f 100644
--- a/docker/sswlinkauditor.go
+++ b/docker/sswlinkauditor.go
@@ -198,11 +198,24 @@ func isSameOrigin(url1 string, url2 string) bool {
 }
 
 func isSameOriginAndPath(baseUrl string, targetUrl string) bool {
-	// Normalize URLs by ensuring they have trailing slashes for comparison
-	normalizedBase := strings.TrimRight(baseUrl, "/") + "/"
-	normalizedTarget := strings.TrimRight(targetUrl, "/") + "/"
+	// Normalize URLs by removing trailing slashes for comparison
+	normalizedBase := strings.TrimRight(baseUrl, "/")
+	normalizedTarget := strings.TrimRight(targetUrl, "/")
 
-	return strings.Index(normalizedTarget, normalizedBase) == 0
+	// Check if target starts with base
+	if !strings.HasPrefix(normalizedTarget, normalizedBase) {
+		return false
+	}
+
+	// If exact match, return true
+	if normalizedTarget == normalizedBase {
+		return true
+	}
+
+	// Check that what comes after the base is a path separator, query, or fragment
+	// This prevents false positives like "/api" matching "/api-v2"
+	remainder := normalizedTarget[len(normalizedBase):]
+	return strings.HasPrefix(remainder, "/") || strings.HasPrefix(remainder, "?") || strings.HasPrefix(remainder, "#")
 }
 
 func crawl(link Link, ch chan Link, linkch chan LinkStatus, number int) {
@@ -319,13 +332,17 @@ func parseUrl(startUrl string, url string) string {
 	if len(filenameRegex.FindStringSubmatch(UrlPath)) > 0 {
 		fileName := filenameRegex.FindStringSubmatch(UrlPath)[0]
 		UrlPath = strings.ReplaceAll(UrlPath, fileName, "")
+	} else if !strings.HasSuffix(UrlPath, "/") && UrlPath != "" {
+		// If the path doesn't end with / and isn't empty, it's a document not a directory
+		// Get the parent directory for relative link resolution
+		lastSlash := strings.LastIndex(UrlPath, "/")
+		if lastSlash >= 0 {
+			UrlPath = UrlPath[:lastSlash+1]
+		}
 	}
 
 	baseUrl := sUrl.Scheme + "://" + sUrl.Hostname() + UrlPath
-	if !strings.HasSuffix(baseUrl, "/") {
-		baseUrl = baseUrl + "/"
-	}
-
+	
 	u, _ := urlP.Parse(baseUrl)
 	u.Path = path.Join(u.Path, url)
 	return u.String()
diff --git a/docker/sswlinkauditor_test.go b/docker/sswlinkauditor_test.go
new file mode 100644
index 00000000..7d6ddb08
--- /dev/null
+++ b/docker/sswlinkauditor_test.go
@@ -0,0 +1,236 @@
+package main
+
+import (
+	urlP "net/url"
+	"testing"
+)
+
+func TestParseUrl(t *testing.T) {
+	tests := []struct {
+		name     string
+		startUrl string
+		url      string
+		expected string
+	}{
+		{
+			name:     "Absolute URL - no change",
+			startUrl: "https://example.com/page",
+			url:      "https://other.com/path",
+			expected: "https://other.com/path",
+		},
+		{
+			name:     "Protocol-relative URL",
+			startUrl: "https://example.com/page",
+			url:      "//cdn.example.com/resource.js",
+			expected: "https://cdn.example.com/resource.js",
+		},
+		{
+			name:     "Root-relative URL",
+			startUrl: "https://example.com/some/page",
+			url:      "/about",
+			expected: "https://example.com/about",
+		},
+		{
+			name:     "Relative URL from page with trailing slash",
+			startUrl: "https://example.com/blog/",
+			url:      "post",
+			expected: "https://example.com/blog/post",
+		},
+		{
+			name:     "Relative URL from page without trailing slash",
+			startUrl: "https://example.com/blog",
+			url:      "post",
+			expected: "https://example.com/post", // blog is a page, so resolve relative to parent
+		},
+		{
+			name:     "Relative URL from page with extension",
+			startUrl: "https://example.com/blog/index.html",
+			url:      "about.html",
+			expected: "https://example.com/blog/about.html",
+		},
+		{
+			name:     "URL with fragment - fragment removed",
+			startUrl: "https://example.com/page",
+			url:      "https://example.com/other#section",
+			expected: "https://example.com/other",
+		},
+		{
+			name:     "Deep relative URL",
+			startUrl: "https://example.com/a/b/c/page.html",
+			url:      "d/e/file.html",
+			expected: "https://example.com/a/b/c/d/e/file.html",
+		},
+		{
+			name:     "Relative URL with .aspx extension in startUrl",
+			startUrl: "https://example.com/products/list.aspx",
+			url:      "detail",
+			expected: "https://example.com/products/detail",
+		},
+		{
+			name:     "Root path should not add trailing slash",
+			startUrl: "https://example.com",
+			url:      "page",
+			expected: "https://example.com/page",
+		},
+		{
+			name:     "SSW Rules page - relative link without extension",
+			startUrl: "https://www.ssw.com.au/rules/best-way-to-display-code-on-your-website",
+			url:      "set-language-on-code-blocks",
+			expected: "https://www.ssw.com.au/rules/set-language-on-code-blocks",
+		},
+		{
+			name:     "Root level page - relative link",
+			startUrl: "https://example.com/page",
+			url:      "other",
+			expected: "https://example.com/other",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := parseUrl(tt.startUrl, tt.url)
+			if result != tt.expected {
+				t.Errorf("parseUrl(%q, %q) = %q; want %q", tt.startUrl, tt.url, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestParseUrl_NoTrailingSlashAdded(t *testing.T) {
+	// Specifically test that we don't unconditionally add trailing slashes
+	tests := []struct {
+		name     string
+		startUrl string
+		url      string
+		wantPath string
+	}{
+		{
+			name:     "Should not add slash to base without directory",
+			startUrl: "https://example.com/page",
+			url:      "other",
+			wantPath: "/other", // path.Join should resolve this correctly
+		},
+		{
+			name:     "Should preserve path structure",
+			startUrl: "https://example.com/api/v1",
+			url:      "users",
+			wantPath: "/api/users", // Should not become /api/v1/users
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := parseUrl(tt.startUrl, tt.url)
+			// Check that the path portion matches expectations
+			if !containsPath(result, tt.wantPath) {
+				t.Errorf("parseUrl(%q, %q) = %q; expected path to be %q", tt.startUrl, tt.url, result, tt.wantPath)
+			}
+		})
+	}
+}
+
+func TestIsSameOriginAndPath(t *testing.T) {
+	tests := []struct {
+		name      string
+		baseUrl   string
+		targetUrl string
+		expected  bool
+	}{
+		{
+			name:      "Exact match",
+			baseUrl:   "https://example.com/blog",
+			targetUrl: "https://example.com/blog",
+			expected:  true,
+		},
+		{
+			name:      "Exact match with trailing slash on both",
+			baseUrl:   "https://example.com/blog/",
+			targetUrl: "https://example.com/blog/",
+			expected:  true,
+		},
+		{
+			name:      "Match with one trailing slash",
+			baseUrl:   "https://example.com/blog",
+			targetUrl: "https://example.com/blog/",
+			expected:  true,
+		},
+		{
+			name:      "Child path with separator",
+			baseUrl:   "https://example.com/blog",
+			targetUrl: "https://example.com/blog/post",
+			expected:  true,
+		},
+		{
+			name:      "Deep child path",
+			baseUrl:   "https://example.com/blog",
+			targetUrl: "https://example.com/blog/2024/01/post",
+			expected:  true,
+		},
+		{
+			name:      "Same prefix but different path - should NOT match",
+			baseUrl:   "https://example.com/api",
+			targetUrl: "https://example.com/api-v2",
+			expected:  false,
+		},
+		{
+			name:      "Same prefix without separator - should NOT match",
+			baseUrl:   "https://example.com/blog",
+			targetUrl: "https://example.com/blogpost",
+			expected:  false,
+		},
+		{
+			name:      "Different domain - should NOT match",
+			baseUrl:   "https://example.com/blog",
+			targetUrl: "https://other.com/blog",
+			expected:  false,
+		},
+		{
+			name:      "Base is longer - should NOT match",
+			baseUrl:   "https://example.com/blog/post",
+			targetUrl: "https://example.com/blog",
+			expected:  false,
+		},
+		{
+			name:      "Query string continuation",
+			baseUrl:   "https://example.com/search",
+			targetUrl: "https://example.com/search?q=test",
+			expected:  true,
+		},
+		{
+			name:      "Fragment continuation",
+			baseUrl:   "https://example.com/page",
+			targetUrl: "https://example.com/page#section",
+			expected:  true,
+		},
+		{
+			name:      "Root path",
+			baseUrl:   "https://example.com/",
+			targetUrl: "https://example.com/anything",
+			expected:  true,
+		},
+		{
+			name:      "Empty path base",
+			baseUrl:   "https://example.com",
+			targetUrl: "https://example.com/page",
+			expected:  true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := isSameOriginAndPath(tt.baseUrl, tt.targetUrl)
+			if result != tt.expected {
+				t.Errorf("isSameOriginAndPath(%q, %q) = %v; want %v", tt.baseUrl, tt.targetUrl, result, tt.expected)
+			}
+		})
+	}
+}
+
+// containsPath reports whether url parses and its path component equals wantPath exactly.
+func containsPath(url, wantPath string) bool {
+	parsed, err := urlP.Parse(url)
+	if err != nil {
+		return false
+	}
+	return parsed.Path == wantPath
+}