From 0282ee35839848ddbf4bd18c1bf0728e9346628e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Mart=C3=ADn=20Sotuyo=20Dodero?= Date: Wed, 17 Apr 2024 23:37:12 -0300 Subject: [PATCH] Properly handle multiple matches once again --- .../sourceforge/pmd/cpd/MatchCollector.java | 82 ++++++++++++------- 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java index a9c3ea1d22..120e84f77f 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/MatchCollector.java @@ -6,15 +6,19 @@ package net.sourceforge.pmd.cpd; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeMap; class MatchCollector { - private final List matchList = new ArrayList<>(); - private final Map> matchTree = new HashMap<>(); + private final Map> matchTree = new TreeMap<>(); + + private final Map> tokenMatchSets = new HashMap<>(); + private final MatchAlgorithm ma; MatchCollector(MatchAlgorithm ma) { @@ -50,42 +54,60 @@ class MatchCollector { } private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) { - matchTree.compute(mark1.getIndex(), (m1Index, matches) -> { - if (matches == null) { - matches = new ArrayList<>(); - addNewMatch(mark1, mark2, dupes, matches); - } else { - Iterator matchIterator = matches.iterator(); - while (matchIterator.hasNext()) { - Match m = matchIterator.next(); - TokenEntry otherEnd = m.getSecondMark().getToken(); + /* + * Check if the match is previously know. This can happen when a snippet is duplicated more than once. + * If A, B and C are identical snippets, MatchAlgorithm will find the matching pairs: + * - AB + * - AC + * - BC + * It should be reduced to a single match with 3 marks + */ + if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).contains(mark2.getIndex())) { + return; + } - // does the new match supersedes this one? - if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) { - // this match is embedded in the previous one… ignore it. - return matches; - } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) { - // the new match is longer and overlaps with the old one - replace it - matchIterator.remove(); - matchList.remove(m); - break; - } - } + List matches = matchTree.computeIfAbsent(mark1.getIndex(), ArrayList::new); + Iterator matchIterator = matches.iterator(); + while (matchIterator.hasNext()) { + Match m = matchIterator.next(); + TokenEntry otherEnd = m.getSecondMark().getToken(); // TODO : this only works for mark1 being the key - addNewMatch(mark1, mark2, dupes, matches); + // does the new match supersedes this one? + if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) { + // this match is embedded in the previous one… ignore it. + return; + } else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) { + // the new match is longer and overlaps with the old one - replace it + matchIterator.remove(); + break; + } else if (dupes == m.getTokenCount()) { + // we found yet another exact match of the same snippet. Roll it together + + // Add this adjacency to all combinations + m.iterator().forEachRemaining(other -> registerTokenMatch(other.getToken(), mark2)); + + m.addMark(mark2); + return; } - return matches; - }); + } + + // this is a new match, add it + matches.add(new Match(dupes, mark1, mark2)); + + // add matches in both directions + registerTokenMatch(mark1, mark2); } - private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, List matches) { - Match match = new Match(dupes, mark1, mark2); - matches.add(match); - matchList.add(match); + private void registerTokenMatch(TokenEntry mark1, TokenEntry mark2) { + tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).add(mark2.getIndex()); + tokenMatchSets.computeIfAbsent(mark2.getIndex(), HashSet::new).add(mark1.getIndex()); } List getMatches() { - return matchList; + return matchTree.values().stream().reduce(new ArrayList<>(), (acc, matches) -> { + acc.addAll(matches); + return acc; + }); } private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {