Properly handle multiple matches once again

2024-04-17 23:37:12 -03:00
parent 9d04358707
commit 0282ee3583
1 changed files with 52 additions and 30 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
@ -6,15 +6,19 @@ package net.sourceforge.pmd.cpd;

 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;

 class MatchCollector {

-    private final List<Match> matchList = new ArrayList<>();
-    private final Map<Integer, List<Match>> matchTree = new HashMap<>();
+    private final Map<Integer, List<Match>> matchTree = new TreeMap<>();
+
+    private final Map<Integer, Set<Integer>> tokenMatchSets = new HashMap<>();
+
    private final MatchAlgorithm ma;

    MatchCollector(MatchAlgorithm ma) {
@ -50,42 +54,60 @@ class MatchCollector {
    }

    private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
-        matchTree.compute(mark1.getIndex(), (m1Index, matches) -> {
-            if (matches == null) {
-                matches = new ArrayList<>();
-                addNewMatch(mark1, mark2, dupes, matches);
-            } else {
-                Iterator<Match> matchIterator = matches.iterator();
-                while (matchIterator.hasNext()) {
-                    Match m = matchIterator.next();
-                    TokenEntry otherEnd = m.getSecondMark().getToken();
+        /*
+         * Check if the match is previously know. This can happen when a snippet is duplicated more than once.
+         * If A, B and C are identical snippets, MatchAlgorithm will find the matching pairs:
+         *  - AB
+         *  - AC
+         *  - BC
+         * It should be reduced to a single match with 3 marks
+         */
+        if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).contains(mark2.getIndex())) {
+            return;
+        }

-                    // does the new match supersedes this one?
-                    if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
-                        // this match is embedded in the previous one… ignore it.
-                        return matches;
-                    } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
-                        // the new match is longer and overlaps with the old one - replace it
-                        matchIterator.remove();
-                        matchList.remove(m);
-                        break;
-                    }
-                }
+        List<Match> matches = matchTree.computeIfAbsent(mark1.getIndex(), ArrayList::new);
+        Iterator<Match> matchIterator = matches.iterator();
+        while (matchIterator.hasNext()) {
+            Match m = matchIterator.next();
+            TokenEntry otherEnd = m.getSecondMark().getToken(); // TODO : this only works for mark1 being the key

-                addNewMatch(mark1, mark2, dupes, matches);
+            // does the new match supersedes this one?
+            if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
+                // this match is embedded in the previous one… ignore it.
+                return;
+            } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
+                // the new match is longer and overlaps with the old one - replace it
+                matchIterator.remove();
+                break;
+            } else if (dupes == m.getTokenCount()) {
+                // we found yet another exact match of the same snippet. Roll it together
+
+                // Add this adjacency to all combinations
+                m.iterator().forEachRemaining(other -> registerTokenMatch(other.getToken(), mark2));
+
+                m.addMark(mark2);
+                return;
            }
-            return matches;
-        });
+        }
+
+        // this is a new match, add it
+        matches.add(new Match(dupes, mark1, mark2));
+
+        // add matches in both directions
+        registerTokenMatch(mark1, mark2);
    }

-    private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, List<Match> matches) {
-        Match match = new Match(dupes, mark1, mark2);
-        matches.add(match);
-        matchList.add(match);
+    private void registerTokenMatch(TokenEntry mark1, TokenEntry mark2) {
+        tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).add(mark2.getIndex());
+        tokenMatchSets.computeIfAbsent(mark2.getIndex(), HashSet::new).add(mark1.getIndex());
    }

    List<Match> getMatches() {
-        return matchList;
+        return matchTree.values().stream().reduce(new ArrayList<>(), (acc, matches) -> {
+            acc.addAll(matches);
+            return acc;
+        });
    }

    private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {