Required dependencies (pom.xml):
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>60.1</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
Auto-detect the charset encoding of a text file or input stream, then skip ("remove") the Byte Order Mark (BOM) while reading the content with the detected charset:
File inputFile = new File("/Users/fahri/Downloads/UNKNOWN_TEXT.txt");

// try-with-resources guarantees the underlying file handle is released even when
// charset detection or reading fails part-way through.
// The BufferedInputStream wrapper supplies the mark/reset support that
// CharsetDetector.setText(InputStream) requires (see the ICU4J documentation).
try (BOMInputStream bomInputStream = new BOMInputStream(new BufferedInputStream(new FileInputStream(inputFile)),
        ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)) {

    boolean hasBom = bomInputStream.hasBOM();
    System.out.println("HAS BOM : " + hasBom);

    String charsetName;
    if (hasBom) {
        // A BOM identifies the encoding unambiguously. Because this stream excludes the BOM
        // from the bytes it hands out, the detector would never see it and could only guess,
        // so trust the BOM directly instead of running the heuristic detection.
        charsetName = bomInputStream.getBOMCharsetName();
    } else {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(bomInputStream);
        CharsetMatch charsetMatch = detector.detect();
        if (charsetMatch == null) {
            // detect() yields null when no charset matches; fail with a clear message
            // rather than an anonymous NullPointerException on getName().
            throw new IllegalStateException("Unable to detect the charset of " + inputFile);
        }
        charsetName = charsetMatch.getName();
    }
    System.out.println("CHARSET MATCH : " + charsetName);

    // Closing the reader also closes bomInputStream; the outer try closing it again is harmless.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(bomInputStream, charsetName))) {
        for (String line = br.readLine(); line != null; line = br.readLine()) {
            System.out.println(line);
        }
    }
}