process to check for local dead links, fails the build of pmd-doc if dead links are found

2018-12-26 13:26:18 +01:00
parent 43098c1cf0
commit ca8f43baef
2 changed files with 161 additions and 0 deletions
--- a/pmd-doc/src/main/java/net/sourceforge/pmd/docs/DeadLinksChecker.java
+++ b/pmd-doc/src/main/java/net/sourceforge/pmd/docs/DeadLinksChecker.java
@@ -0,0 +1,158 @@
+/**
+ * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
+ */
+
+package net.sourceforge.pmd.docs;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;
+
+import org.apache.commons.io.IOUtils;
+
+/**
+ * Checks links to local pages for non-existing link-targets.
+ *
+ * <p>Valid targets are the permalink of every markdown page plus one anchor per
+ * caption ("##" or deeper) of that page; any other local markdown link is reported.
+ */
+public class DeadLinksChecker {
+
+    // Markdown-Link: something in []'s followed by something in ()'s
+    private static final Pattern LOCAL_LINK_PATTERN = Pattern.compile("\\[.*?\\]\\((.*?)\\)");
+
+    // Markdown permalink-header and captions
+    private static final Pattern MD_HEADER_PERMALINK = Pattern.compile("permalink:\\s*(.*)");
+    private static final Pattern MD_CAPTION = Pattern.compile("^##+\\s+(.*)$", Pattern.MULTILINE);
+
+    // list of link targets, where the link detection doesn't work
+    private static final Pattern EXCLUDED_LINK_TARGETS = Pattern.compile(
+            "^pmd_userdocs_cli_reference\\.html.*" // anchors in the CLI reference are a plain HTML include
+    );
+
+    /**
+     * Scans every markdown file below the given directory and prints each dead local link,
+     * grouped by file and prefixed with its line number, to stderr.
+     *
+     * @param pagesDirectory root of the markdown pages; if it is not a directory, the JVM
+     *                       is terminated with exit status 1
+     * @throws AssertionError if at least one dead link was found
+     */
+    public void checkDeadLinks(Path pagesDirectory) {
+        if (!Files.isDirectory(pagesDirectory)) {
+            System.err.println("can't check for dead links, didn't find \"pages\" directory at: " + pagesDirectory);
+            System.exit(1);
+        }
+
+        // read all .md-files in the pages directory
+        final List<Path> mdFiles = listMdFiles(pagesDirectory);
+
+        // make a list of all valid link targets
+        final Set<String> htmlPages = extractLinkTargets(mdFiles);
+
+        // scan all .md-files for dead local links
+        Path errorFile = null; // most recent file with a dead link, null while none was found
+        int scannedFiles = 0;
+        for (Path mdFile : mdFiles) {
+            final String pageContent = fileToString(mdFile);
+            scannedFiles++;
+
+            // iterate line-by-line for better reporting the line numbers
+            final String[] lines = pageContent.split("\r?\n");
+            for (int index = 0; index < lines.length; index++) {
+                final int lineNo = index + 1;
+                final Matcher matcher = LOCAL_LINK_PATTERN.matcher(lines[index]);
+                while (matcher.find()) {
+                    // remove the leading "/"
+                    final String linkTarget = matcher.group(1).replaceAll("^/+", "");
+                    if (isIgnoredTarget(linkTarget) || htmlPages.contains(linkTarget)) {
+                        continue;
+                    }
+
+                    if (errorFile == null) {
+                        System.err.println("Found dead link(s):");
+                    }
+                    if (!mdFile.equals(errorFile)) {
+                        // print the file name once, above its first dead link
+                        System.err.println(mdFile);
+                        errorFile = mdFile;
+                    }
+                    System.err.printf("%8d: %s%n", lineNo, matcher.group());
+                }
+            }
+        }
+        if (errorFile != null) {
+            throw new AssertionError("dead links detected");
+        } else {
+            System.out.println("Scanned " + scannedFiles + " files for dead links - no errors found!");
+        }
+    }
+
+    /**
+     * Tells whether a link target is not checked at all: empty targets, http/https links,
+     * targets starting with "#" (local anchors) and targets matching
+     * {@link #EXCLUDED_LINK_TARGETS}.
+     */
+    private static boolean isIgnoredTarget(String linkTarget) {
+        return linkTarget.isEmpty()
+                || linkTarget.startsWith("http://") || linkTarget.startsWith("https://")
+                || linkTarget.startsWith("#")
+                || EXCLUDED_LINK_TARGETS.matcher(linkTarget).matches();
+    }
+
+    /**
+     * Collects all valid local link targets: the permalink of each page, and the permalink
+     * followed by "#" and an anchor for each of its captions. Files without a permalink
+     * contribute nothing.
+     */
+    private Set<String> extractLinkTargets(List<Path> mdFiles) {
+        final Set<String> htmlPages = new HashSet<>();
+        for (Path mdFile : mdFiles) {
+            final String pageContent = fileToString(mdFile);
+
+            // extract the permalink header field
+            final Matcher permalinkMatcher = MD_HEADER_PERMALINK.matcher(pageContent);
+            if (!permalinkMatcher.find()) {
+                continue;
+            }
+
+            final String pageUrl = permalinkMatcher.group(1)
+                    .trim() // whitespace after the value is not part of the url
+                    .replaceAll("^/+", ""); // remove the leading "/"
+
+            // add the root page
+            htmlPages.add(pageUrl);
+
+            // add all captions as anchors
+            final Matcher captionMatcher = MD_CAPTION.matcher(pageContent);
+            while (captionMatcher.find()) {
+                final String anchor = captionMatcher.group(1)
+                        .toLowerCase(Locale.ROOT)
+                        .replaceAll("[^a-z0-9_]+", "-") // replace all non-alphanumeric characters with dashes
+                        .replaceAll("^-+|-+$", ""); // trim leading or trailing dashes
+
+                htmlPages.add(pageUrl + "#" + anchor);
+            }
+        }
+        return htmlPages;
+    }
+
+    /** Recursively lists the regular files below the given directory whose path ends with ".md". */
+    private List<Path> listMdFiles(Path pagesDirectory) {
+        final List<Path> mdFiles = new ArrayList<>();
+        // the stream returned by Files.walk holds open directories and must be closed
+        try (Stream<Path> paths = Files.walk(pagesDirectory)) {
+            paths.filter(Files::isRegularFile)
+                    .filter(path -> path.toString().endsWith(".md"))
+                    .forEach(mdFiles::add);
+        } catch (IOException ex) {
+            throw new RuntimeException("error listing files in " + pagesDirectory, ex);
+        }
+        return mdFiles;
+    }
+
+    /** Reads the whole file as UTF-8 text, wrapping any I/O failure in a RuntimeException. */
+    private String fileToString(Path mdFile) {
+        try (InputStream inputStream = Files.newInputStream(mdFile)) {
+            return IOUtils.toString(inputStream, Charset.forName("UTF-8"));
+        } catch (IOException ex) {
+            throw new RuntimeException("error reading " + mdFile, ex);
+        }
+    }
+
+}
--- a/pmd-doc/src/main/java/net/sourceforge/pmd/docs/GenerateRuleDocsCmd.java
+++ b/pmd-doc/src/main/java/net/sourceforge/pmd/docs/GenerateRuleDocsCmd.java
@@ -41,6 +41,9 @@ public final class GenerateRuleDocsCmd {
        generator.generate(registeredRuleSets, additionalRulesets);

        System.out.println("Generated docs in " + (System.currentTimeMillis() - start) + " ms");
+
+        DeadLinksChecker deadLinksChecker = new DeadLinksChecker();
+        deadLinksChecker.checkDeadLinks(output.resolve("docs/pages"));
    }

    public static List<String> findAdditionalRulesets(Path basePath) {