diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java index 19e4fb2eb6..ba78977ce7 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java @@ -15,7 +15,9 @@ import java.util.List; import net.sourceforge.pmd.PMD; +import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.BOMInputStream; public class SourceCode { @@ -68,7 +70,18 @@ public class SourceCode { @Override public Reader getReader() throws Exception { - return new InputStreamReader(new FileInputStream(file), encoding); + BOMInputStream inputStream = + new BOMInputStream(new FileInputStream(file), + ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE); + + if (inputStream.hasBOM()) { + encoding = inputStream.getBOMCharsetName(); + } + return new InputStreamReader(inputStream, encoding); + } + + public String getEncoding() { + return encoding; } @Override diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java index a3758ccc90..62cbe12785 100644 --- a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java +++ b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java @@ -5,14 +5,18 @@ package net.sourceforge.pmd.cpd; import static org.junit.Assert.assertEquals; +import java.io.File; import java.util.ArrayList; import net.sourceforge.pmd.PMD; +import net.sourceforge.pmd.cpd.SourceCode.FileCodeLoader; import org.junit.Test; public class SourceCodeTest { - + private static final String BASE_RESOURCE_PATH = + "src/test/resources/net/sourceforge/pmd/cpd/files/"; + private static final String SAMPLE_CODE = "Line 1\n" + "Line 2\n" + @@ -36,4 +40,24 @@ public class SourceCodeTest { assertEquals("Line 2", sourceCode.getSlice(2, 2)); assertEquals("Line 1" + PMD.EOL + "Line 2", sourceCode.getSlice(1, 2)); } + + @Test + public void testEncodingDetectionFromBOM() throws Exception { + FileCodeLoader loader = + new SourceCode.FileCodeLoader(new File(BASE_RESOURCE_PATH + "file_with_utf8_bom.java"), "ISO-8859-1"); + + //The encoding detection is done when the reader is created + loader.getReader(); + assertEquals("UTF-8", loader.getEncoding()); + } + + @Test + public void testEncodingIsNotChangedWhenThereIsNoBOM() throws Exception { + FileCodeLoader loader = + new SourceCode.FileCodeLoader(new File(BASE_RESOURCE_PATH + "file_with_ISO-8859-1_encoding.java"), "ISO-8859-1"); + + //The encoding detection is done when the reader is created + loader.getReader(); + assertEquals("ISO-8859-1", loader.getEncoding()); + } } diff --git a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java new file mode 100644 index 0000000000..d7f62ea9ed --- /dev/null +++ b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java @@ -0,0 +1,8 @@ +/** + * This file is using ISO-8859-1 (Latin-1) encoding. + * + * ä + */ +public class FileWith_ISO8859-1_Encoding { + +} diff --git a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java new file mode 100644 index 0000000000..566bf55d83 --- /dev/null +++ b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java @@ -0,0 +1,8 @@ +/** + * This file is using UTF-8 with BOM encoding. + * + * ä + */ +public class FileWith_UTF-8-BOM_Encoding { + +} diff --git a/src/site/markdown/overview/changelog.md b/src/site/markdown/overview/changelog.md index 0c9779dead..7aa77e58bf 100644 --- a/src/site/markdown/overview/changelog.md +++ b/src/site/markdown/overview/changelog.md @@ -18,6 +18,7 @@ * [#27](https://github.com/adangel/pmd/pull/27): Added support for Raw String Literals (C++11). * [#29](https://github.com/adangel/pmd/pull/29): Added support for files with UTF-8 BOM to JSP tokenizer. * [#30](https://github.com/adangel/pmd/pull/30): Removed file filter for files that are explicitly specified on the CPD command line using the '--files' command line option. +* [#31](https://github.com/adangel/pmd/pull/31): Added file encoding detection to CPD. * [#79](https://github.com/pmd/pmd/pull/79): do not flag public static void main(String[]) as UseVarargs; ignore @Override for UseVarargs * [#80](https://github.com/pmd/pmd/pull/80): Update mvn-plugin.md * [#83](https://github.com/pmd/pmd/pull/83): Adds new Code Climate-compliant JSON renderer