Properly handle multiple matches once again

This commit is contained in:
Juan Martín Sotuyo Dodero
2024-04-17 23:37:12 -03:00
parent 9d04358707
commit 0282ee3583

@ -6,15 +6,19 @@ package net.sourceforge.pmd.cpd;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
class MatchCollector {
// Flat list of matches: appended to by addNewMatch and returned by the first return statement in getMatches.
private final List<Match> matchList = new ArrayList<>();
// Matches grouped under the index of their first mark (the key reportMatch uses).
// NOTE(review): matchTree is declared twice (HashMap, then TreeMap) - presumably the old and new
// side of the same diff line; only one declaration can remain in a compilable file.
private final Map<Integer, List<Match>> matchTree = new HashMap<>();
private final Map<Integer, List<Match>> matchTree = new TreeMap<>();
// For each token index, the indices of the tokens it has been registered as matching
// (filled in both directions by registerTokenMatch).
private final Map<Integer, Set<Integer>> tokenMatchSets = new HashMap<>();
// NOTE(review): presumably the algorithm instance passed to the constructor - usage is not visible here, confirm.
private final MatchAlgorithm ma;
MatchCollector(MatchAlgorithm ma) {
@ -50,42 +54,60 @@ class MatchCollector {
}
/**
 * Records a duplication of {@code dupes} tokens between {@code mark1} and {@code mark2}.
 * Matches are stored in {@code matchTree} under {@code mark1.getIndex()}; a new report is compared
 * against the matches already stored under that key before being added.
 *
 * NOTE(review): this span appears to interleave two revisions of the method - one built around a
 * {@code matchTree.compute(...)} lambda that calls {@code addNewMatch}, and one built around
 * {@code computeIfAbsent} plus {@code registerTokenMatch}. As listed it is not a single compilable
 * method body; confirm which lines are live before changing any logic.
 */
private void reportMatch(TokenEntry mark1, TokenEntry mark2, int dupes) {
matchTree.compute(mark1.getIndex(), (m1Index, matches) -> {
if (matches == null) {
// first match reported for this index: start a fresh list
matches = new ArrayList<>();
addNewMatch(mark1, mark2, dupes, matches);
} else {
Iterator<Match> matchIterator = matches.iterator();
while (matchIterator.hasNext()) {
Match m = matchIterator.next();
TokenEntry otherEnd = m.getSecondMark().getToken();
/*
* Check if the match is previously known. This can happen when a snippet is duplicated more than once.
* If A, B and C are identical snippets, MatchAlgorithm will find the matching pairs:
* - AB
* - AC
* - BC
* It should be reduced to a single match with 3 marks.
*/
// NOTE(review): with an Integer key, HashSet::new binds to HashSet(int initialCapacity), so the
// token index is used as the set's initial capacity - confirm this is not intended.
if (tokenMatchSets.computeIfAbsent(mark1.getIndex(), HashSet::new).contains(mark2.getIndex())) {
return;
}
// does the new match supersede this one?
if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
// the new match is embedded in the existing one - ignore it.
return matches;
} else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
// the new match is longer and overlaps with the old one - replace it
matchIterator.remove();
matchList.remove(m);
break;
}
}
// NOTE(review): same concern as above - ArrayList::new with an Integer key binds to
// ArrayList(int initialCapacity), allocating a backing array sized by the token index.
List<Match> matches = matchTree.computeIfAbsent(mark1.getIndex(), ArrayList::new);
Iterator<Match> matchIterator = matches.iterator();
while (matchIterator.hasNext()) {
Match m = matchIterator.next();
TokenEntry otherEnd = m.getSecondMark().getToken(); // TODO : this only works for mark1 being the key
addNewMatch(mark1, mark2, dupes, matches);
// does the new match supersede this one?
if (otherEnd.getIndex() < mark2.getIndex() && otherEnd.getIndex() + m.getTokenCount() >= mark2.getIndex() + dupes) {
// the new match is embedded in the existing one - ignore it.
return;
} else if (mark2.getIndex() < otherEnd.getIndex() && mark2.getIndex() + dupes >= otherEnd.getIndex() + m.getTokenCount()) {
// the new match is longer and overlaps with the old one - replace it
matchIterator.remove();
break;
} else if (dupes == m.getTokenCount()) {
// we found yet another exact match of the same snippet. Roll it together
// Add this adjacency to all combinations
m.iterator().forEachRemaining(other -> registerTokenMatch(other.getToken(), mark2));
m.addMark(mark2);
return;
}
return matches;
});
}
// this is a new match, add it
matches.add(new Match(dupes, mark1, mark2));
// add matches in both directions
registerTokenMatch(mark1, mark2);
}
/**
 * Creates a {@link Match} of {@code dupes} tokens between {@code mark1} and {@code mark2} and
 * appends it to both the supplied {@code matches} list and {@code matchList}.
 *
 * NOTE(review): no closing brace of its own appears before the next method header in this
 * listing - presumably this is the removed side of the diff; confirm before relying on it.
 */
private void addNewMatch(TokenEntry mark1, TokenEntry mark2, int dupes, List<Match> matches) {
Match match = new Match(dupes, mark1, mark2);
// keep the per-index list and the flat list in step
matches.add(match);
matchList.add(match);
/**
 * Records that the tokens at {@code mark1} and {@code mark2} have been matched with each other.
 * The relation is stored symmetrically in {@code tokenMatchSets}: each token's index is added to
 * the set kept under the other token's index.
 *
 * @param mark1 one end of the match
 * @param mark2 the other end of the match
 */
private void registerTokenMatch(TokenEntry mark1, TokenEntry mark2) {
    // Use an explicit lambda instead of HashSet::new: with an Integer key the constructor
    // reference binds to HashSet(int initialCapacity), which would size every set by its
    // token index rather than creating a default-sized empty set.
    tokenMatchSets.computeIfAbsent(mark1.getIndex(), k -> new HashSet<>()).add(mark2.getIndex());
    tokenMatchSets.computeIfAbsent(mark2.getIndex(), k -> new HashSet<>()).add(mark1.getIndex());
}
/**
 * Returns the matches collected so far.
 *
 * NOTE(review): two return statements appear here, apparently one from each revision of this
 * change - the first hands back {@code matchList} directly, the second concatenates the lists
 * stored in {@code matchTree}. Confirm which one is live; as listed the second is unreachable.
 */
List<Match> getMatches() {
return matchList;
// NOTE(review): the accumulator mutates and returns the identity list passed to reduce();
// Stream.reduce expects a true identity value, so a mutable reduction (collect) or a plain
// loop would be the conventional form - consider replacing.
return matchTree.values().stream().reduce(new ArrayList<>(), (acc, matches) -> {
acc.addAll(matches);
return acc;
});
}
private boolean hasPreviousDupe(TokenEntry mark1, TokenEntry mark2) {