// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

// localLinks returns the deduplicated set of site-relative links
// (href values beginning with "/") found in body, skipping links
// under /src/.
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, m[1])
		}
	}
	return
}

var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns the values of all id attributes found in body.
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl records url as seen and queues it for the crawlLoop goroutine
// if it hasn't been crawled before.
// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

// addProblem records an error for url, including which pages linked to it.
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

// crawlLoop fetches each URL queued on urlq and records any errors.
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}

// doCrawl fetches url, follows on-site redirects, and records the
// local links and fragment IDs found on the page.
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("Error reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf(" links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}