Properly handle multiple matches once again

This commit is contained in:
Juan Martín Sotuyo Dodero
2024-04-17 23:37:12 -03:00
parent 9d04358707
commit 0282ee3583

View File

@@ -6,15 +6,19 @@ package net.sourceforge.pmd.cpd;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
class MatchCollector { class MatchCollector {
private final List<Match> matchList = new ArrayList<>(); private final Map<Integer, List<Match>> matchTree = new TreeMap<>();
private final Map<Integer, List<Match>> matchTree = new HashMap<>();
private final Map<Integer, Set<Integer>> tokenMatchSets = new HashMap<>();
private final MatchAlgorithm ma; private final MatchAlgorithm ma;
MatchCollector(MatchAlgorithm ma) { MatchCollector(MatchAlgorithm ma) {
@@ -50,42 +54,60 @@ class MatchCollector {
} }
private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) { private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
matchTree.compute(mark1.getIndex(), (m1Index, matches) -> { /*
if (matches == null) { * Check if the match is previously known. This can happen when a snippet is duplicated more than once.
matches = new ArrayList<>(); * If A, B and C are identical snippets, MatchAlgorithm will find the matching pairs:
addNewMatch(mark1, mark2, dupes, matches); * - AB
} else { * - AC
Iterator<Match> matchIterator = matches.iterator(); * - BC
while (matchIterator.hasNext()) { * It should be reduced to a single match with 3 marks
Match m = matchIterator.next(); */
TokenEntry otherEnd = m.getSecondMark().getToken(); if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).contains(mark2.getIndex())) {
return;
}
// does the new match supersede this one? List<Match> matches = matchTree.computeIfAbsent(mark1.getIndex(), ArrayList::new);
if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) { Iterator<Match> matchIterator = matches.iterator();
// this match is embedded in the previous one… ignore it. while (matchIterator.hasNext()) {
return matches; Match m = matchIterator.next();
} else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) { TokenEntry otherEnd = m.getSecondMark().getToken(); // TODO : this only works for mark1 being the key
// the new match is longer and overlaps with the old one - replace it
matchIterator.remove();
matchList.remove(m);
break;
}
}
addNewMatch(mark1, mark2, dupes, matches); // does the new match supersede this one?
if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
// this match is embedded in the previous one… ignore it.
return;
} else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
// the new match is longer and overlaps with the old one - replace it
matchIterator.remove();
break;
} else if (dupes == m.getTokenCount()) {
// we found yet another exact match of the same snippet. Roll it together
// Add this adjacency to all combinations
m.iterator().forEachRemaining(other -> registerTokenMatch(other.getToken(), mark2));
m.addMark(mark2);
return;
} }
return matches; }
});
// this is a new match, add it
matches.add(new Match(dupes, mark1, mark2));
// add matches in both directions
registerTokenMatch(mark1, mark2);
} }
private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, List<Match> matches) { private void registerTokenMatch(TokenEntry mark1, TokenEntry mark2) {
Match match = new Match(dupes, mark1, mark2); tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).add(mark2.getIndex());
matches.add(match); tokenMatchSets.computeIfAbsent(mark2.getIndex(), HashSet::new).add(mark1.getIndex());
matchList.add(match);
} }
List<Match> getMatches() { List<Match> getMatches() {
return matchList; return matchTree.values().stream().reduce(new ArrayList<>(), (acc, matches) -> {
acc.addAll(matches);
return acc;
});
} }
private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) { private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {