Properly handle multiple matches once again

2024-04-17 23:37:12 -03:00
parent 9d04358707
commit 0282ee3583
1 changed files with 52 additions and 30 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java
@@ -6,15 +6,19 @@ package net.sourceforge.pmd.cpd;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 class MatchCollector {
-    private final List<Match> matchList = new ArrayList<>();
+    private final Map<Integer, List<Match>> matchTree = new TreeMap<>();
-    private final Map<Integer, List<Match>> matchTree = new HashMap<>();
+
    private final Map<Integer, Set<Integer>> tokenMatchSets = new HashMap<>();
    private final MatchAlgorithm ma;
    MatchCollector(MatchAlgorithm ma) {
@@ -50,42 +54,60 @@ class MatchCollector {
    }
    private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
-        matchTree.compute(mark1.getIndex(), (m1Index, matches) -> {
+        /*
-            if (matches == null) {
+         * Check if the match is previously known. This can happen when a snippet is duplicated more than once.
-                matches = new ArrayList<>();
+         * If A, B and C are identical snippets, MatchAlgorithm will find the matching pairs:
-                addNewMatch(mark1, mark2, dupes, matches);
+         *  - AB
-            } else {
+         *  - AC
-                Iterator<Match> matchIterator = matches.iterator();
+         *  - BC
-                while (matchIterator.hasNext()) {
+         * It should be reduced to a single match with 3 marks
-                    Match m = matchIterator.next();
+         */
-                    TokenEntry otherEnd = m.getSecondMark().getToken();
+        if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).contains(mark2.getIndex())) {
            return;
        }
-                    // does the new match supersedes this one?
+        List<Match> matches = matchTree.computeIfAbsent(mark1.getIndex(), ArrayList::new);
-                    if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
+        Iterator<Match> matchIterator = matches.iterator();
-                        // this match is embedded in the previous one… ignore it.
+        while (matchIterator.hasNext()) {
-                        return matches;
+            Match m = matchIterator.next();
-                    } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
+            TokenEntry otherEnd = m.getSecondMark().getToken(); // TODO : this only works for mark1 being the key
                        // the new match is longer and overlaps with the old one - replace it
                        matchIterator.remove();
                        matchList.remove(m);
                        break;
                    }
                }
-                addNewMatch(mark1, mark2, dupes, matches);
+            // does the new match supersede this one?
            if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
                // this match is embedded in the previous one… ignore it.
                return;
            } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
                // the new match is longer and overlaps with the old one - replace it
                matchIterator.remove();
                break;
            } else if (dupes == m.getTokenCount()) {
                // we found yet another exact match of the same snippet. Roll it together
                // Add this adjacency to all combinations
                m.iterator().forEachRemaining(other -> registerTokenMatch(other.getToken(), mark2));
                m.addMark(mark2);
                return;
            }
-            return matches;
+        }
-        });
+
        // this is a new match, add it
        matches.add(new Match(dupes, mark1, mark2));
        // add matches in both directions
        registerTokenMatch(mark1, mark2);
    }
-    private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, List<Match> matches) {
+    private void registerTokenMatch(TokenEntry mark1, TokenEntry mark2) {
-        Match match = new Match(dupes, mark1, mark2);
+        tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).add(mark2.getIndex());
-        matches.add(match);
+        tokenMatchSets.computeIfAbsent(mark2.getIndex(), HashSet::new).add(mark1.getIndex());
        matchList.add(match);
    }
    List<Match> getMatches() {
-        return matchList;
+        return matchTree.values().stream().reduce(new ArrayList<>(), (acc, matches) -> {
            acc.addAll(matches);
            return acc;
        });
    }
    private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {