Ensure CPD matches are nonoverlapping maximals

- The old implementation would fail on scenarios where duplicates exceed the minimum token window. In general, if we have 20 identical tokens, 1 different, and then the same 20 again, CPD would find that the 20 at the beginning match the last 20… but also that the 19 at the beginning match the last 19, the 18 at the beginning… down to the window size
2024-04-17 20:10:40 -03:00 · 2024-04-17 20:10:40 -03:00 · f9cb7ab992
commit f9cb7ab992
parent 2f54938793
1 changed files with 25 additions and 18 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
@ -5,6 +5,8 @@
 package net.sourceforge.pmd.cpd;

 import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
@ -12,7 +14,7 @@ import java.util.TreeMap;
 class MatchCollector {

    private final List<Match> matchList = new ArrayList<>();
-    private final Map<Integer, Map<Integer, Match>> matchTree = new TreeMap<>();
+    private final Map<Integer, List<Match>> matchTree = new HashMap<>();
    private final MatchAlgorithm ma;

    MatchCollector(MatchAlgorithm ma) {
@ -38,7 +40,7 @@ class MatchCollector {
                if (dupes < ma.getMinimumTileSize()) {
                    continue;
                }
-                // is it still too close together
+                // both blocks overlap
                if (diff + dupes >= 1) {
                    continue;
                }
@ -48,32 +50,37 @@ class MatchCollector {
    }

    private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
-        matchTree.compute(dupes, (dupCount, matches) -> {
+        matchTree.compute(mark1.getIndex(), (m1Index, matches) -> {
            if (matches == null) {
-                matches = new TreeMap<>();
-                addNewMatch(mark1, mark2, dupCount, matches);
+                matches = new ArrayList<>();
+                addNewMatch(mark1, mark2, dupes, matches);
            } else {
-                Match matchA = matches.get(mark1.getIndex());
-                Match matchB = matches.get(mark2.getIndex());
+                Iterator<Match> matchIterator = matches.iterator();
+                while (matchIterator.hasNext()) {
+                    Match m = matchIterator.next();
+                    TokenEntry otherEnd = m.getSecondMark().getToken();

-                if (matchA == null && matchB == null) {
-                    addNewMatch(mark1, mark2, dupes, matches);
-                } else if (matchA == null) {
-                    matchB.addMark(mark1);
-                    matches.put(mark1.getIndex(), matchB);
-                } else if (matchB == null) {
-                    matchA.addMark(mark2);
-                    matches.put(mark2.getIndex(), matchA);
+                    // is the new match covered by this one, or does it supersede this one?
+                    if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
+                        // this match is embedded in the previous one… ignore it.
+                        return matches;
+                    } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
+                        // the new match is longer and overlaps with the old one - replace it
+                        matchIterator.remove();
+                        matchList.remove(m);
+                        break;
+                    }
                }
+
+                addNewMatch(mark1, mark2, dupes, matches);
            }
            return matches;
        });
    }

-    private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, Map<Integer, Match> matches) {
+    private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, List<Match> matches) {
        Match match = new Match(dupes, mark1, mark2);
-        matches.put(mark1.getIndex(), match);
-        matches.put(mark2.getIndex(), match);
+        matches.add(match);
        matchList.add(match);
    }