From 607606534d46b62d1d0e0e017e0c19e39df905a9 Mon Sep 17 00:00:00 2001
From: Jan van Nunen <jan.van.nunen@tiobe.com>
Date: Mon, 11 Jan 2016 11:37:14 +0100
Subject: [PATCH 1/2] Added file encoding detection to CPD.

---
 .../net/sourceforge/pmd/cpd/SourceCode.java   | 15 ++++++++++-
 .../sourceforge/pmd/cpd/SourceCodeTest.java   | 26 ++++++++++++++++++-
 .../files/file_with_ISO-8859-1_encoding.java  |  0
 .../pmd/cpd/files/file_with_utf8_bom.java     |  1 +
 4 files changed, 40 insertions(+), 2 deletions(-)
 create mode 100644 pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java
 create mode 100644 pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java

diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java
index 19e4fb2eb6..ba78977ce7 100644
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/SourceCode.java
@@ -15,7 +15,9 @@ import java.util.List;
 
 import net.sourceforge.pmd.PMD;
 
+import org.apache.commons.io.ByteOrderMark;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.BOMInputStream;
 
 public class SourceCode {
 
@@ -68,7 +70,35 @@ public class SourceCode {
 
         @Override
         public Reader getReader() throws Exception {
-            return new InputStreamReader(new FileInputStream(file), encoding);
+            // A byte order mark, when present, is authoritative: it replaces the
+            // configured encoding and is not passed on to the returned reader.
+            FileInputStream fileStream = new FileInputStream(file);
+            try {
+                BOMInputStream inputStream = new BOMInputStream(fileStream,
+                        ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE);
+
+                if (inputStream.hasBOM()) {
+                    encoding = inputStream.getBOMCharsetName();
+                }
+                return new InputStreamReader(inputStream, encoding);
+            } catch (Exception e) {
+                // Probing for the BOM or resolving the charset failed: release the
+                // file handle, since the caller never receives a reader to close.
+                IOUtils.closeQuietly(fileStream);
+                throw e;
+            }
+        }
+
+        /**
+         * Returns the encoding used to read the file.
+         *
+         * <p>This is the configured encoding until {@link #getReader()} detects a
+         * byte order mark; from then on it is the encoding named by that BOM.</p>
+         *
+         * @return the name of the current encoding
+         */
+        public String getEncoding() {
+            return encoding;
         }
 
         @Override
diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java
index a3758ccc90..62cbe12785 100644
--- a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java
+++ b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/SourceCodeTest.java
@@ -5,14 +5,18 @@ package net.sourceforge.pmd.cpd;
 
 import static org.junit.Assert.assertEquals;
 
+import java.io.File;
 import java.util.ArrayList;
 
 import net.sourceforge.pmd.PMD;
+import net.sourceforge.pmd.cpd.SourceCode.FileCodeLoader;
 
 import org.junit.Test;
 
 public class SourceCodeTest {
-
+    // Directory of the CPD test fixtures, relative to the pmd-core module root (the tests' working directory).
+    private static final String BASE_RESOURCE_PATH = "src/test/resources/net/sourceforge/pmd/cpd/files/";
+
     private static final String SAMPLE_CODE =
             "Line 1\n" +
             "Line 2\n" +
@@ -36,4 +40,32 @@ public class SourceCodeTest {
         assertEquals("Line 2", sourceCode.getSlice(2, 2));
         assertEquals("Line 1" + PMD.EOL + "Line 2", sourceCode.getSlice(1, 2));
     }
+
+    /**
+     * A UTF-8 BOM in the file must override the encoding the loader was configured with.
+     */
+    @Test
+    public void testEncodingDetectionFromBOM() throws Exception {
+        FileCodeLoader loader =
+                new SourceCode.FileCodeLoader(new File(BASE_RESOURCE_PATH + "file_with_utf8_bom.java"), "ISO-8859-1");
+
+        // The encoding detection is done when the reader is created;
+        // close the reader right away so the test does not leak a file handle.
+        loader.getReader().close();
+        assertEquals("UTF-8", loader.getEncoding());
+    }
+
+    /**
+     * Without a BOM the loader must keep the encoding it was configured with.
+     */
+    @Test
+    public void testEncodingIsNotChangedWhenThereIsNoBOM() throws Exception {
+        FileCodeLoader loader =
+                new SourceCode.FileCodeLoader(new File(BASE_RESOURCE_PATH + "file_with_ISO-8859-1_encoding.java"), "ISO-8859-1");
+
+        // The encoding detection is done when the reader is created;
+        // close the reader right away so the test does not leak a file handle.
+        loader.getReader().close();
+        assertEquals("ISO-8859-1", loader.getEncoding());
+    }
 }
diff --git a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java
new file mode 100644
index 0000000000..5f282702bb
--- /dev/null
+++ b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java
@@ -0,0 +1 @@
+
\ No newline at end of file

From 04527bc4cb9c004ff3daf145690a1268b04a2a70 Mon Sep 17 00:00:00 2001
From: Andreas Dangel <adangel@users.sourceforge.net>
Date: Thu, 21 Jan 2016 20:23:02 +0100
Subject: [PATCH 2/2] Update changelog and fill in encoding test fixtures

---
 .../pmd/cpd/files/file_with_ISO-8859-1_encoding.java     | 8 ++++++++
 .../sourceforge/pmd/cpd/files/file_with_utf8_bom.java    | 9 ++++++++-
 src/site/markdown/overview/changelog.md                  | 1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java
index e69de29bb2..d7f62ea9ed 100644
--- a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java
+++ b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_ISO-8859-1_encoding.java
@@ -0,0 +1,8 @@
+/**
+ * This file is using ISO-8859-1 (Latin-1) encoding.
+ *
+ * �
+ */
+public class FileWith_ISO8859-1_Encoding {
+
+}
diff --git a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java
index 5f282702bb..566bf55d83 100644
--- a/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java
+++ b/pmd-core/src/test/resources/net/sourceforge/pmd/cpd/files/file_with_utf8_bom.java
@@ -1 +1,8 @@
-
\ No newline at end of file
+/**
+ * This file is using UTF-8 with BOM encoding.
+ *
+ * ä
+ */
+public class FileWith_UTF-8-BOM_Encoding {
+
+}
diff --git a/src/site/markdown/overview/changelog.md b/src/site/markdown/overview/changelog.md
index 0c9779dead..7aa77e58bf 100644
--- a/src/site/markdown/overview/changelog.md
+++ b/src/site/markdown/overview/changelog.md
@@ -18,6 +18,7 @@
 *   [#27](https://github.com/adangel/pmd/pull/27): Added support for Raw String Literals (C++11).
 *   [#29](https://github.com/adangel/pmd/pull/29): Added support for files with UTF-8 BOM to JSP tokenizer.
 *   [#30](https://github.com/adangel/pmd/pull/30): Removed file filter for files that are explicitly specified on the CPD command line using the '--files' command line option.
+*   [#31](https://github.com/adangel/pmd/pull/31): Added file encoding detection (based on the byte order mark) to CPD.
 *   [#79](https://github.com/pmd/pmd/pull/79): do not flag public static void main(String[]) as UseVarargs; ignore @Override for UseVarargs
 *   [#80](https://github.com/pmd/pmd/pull/80): Update mvn-plugin.md
 *   [#83](https://github.com/pmd/pmd/pull/83): Adds new Code Climate-compliant JSON renderer