From d7acfe2c41ce97a5e39e77dedec57660f7c64ed0 Mon Sep 17 00:00:00 2001 From: LakatosMark Date: Wed, 6 May 2026 16:30:07 +0200 Subject: [PATCH 1/2] Fix #850: Resolve UTF-8 encoding and IPC issues in Search plugin Forces UTF-8 encoding in the C++ layer and rewrites the Java IOHelper to properly decode multi-byte UTF-8 characters. --- .../src/cc/search/indexer/Context.java | 4 +++- .../src/cc/search/indexer/util/IOHelper.java | 14 ++++++-------- plugins/search/indexer/src/indexerprocess.cpp | 1 + .../service/include/service/serviceprocess.h | 1 + 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java b/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java index 55558d61d..6bdaf5ee1 100644 --- a/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java +++ b/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java @@ -14,6 +14,8 @@ import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; /** * Analysis context. @@ -60,7 +62,7 @@ public Context(String fileId_, File file_, String fileMimeType_) // Read content from a file stream try (FileInputStream stream = new FileInputStream(file_)) { String fileContent = IOHelper.readFullContent( - IOHelper.getReaderForInput(stream)); + new InputStreamReader(stream, StandardCharsets.UTF_8)); // Get line informations try (Reader reader = new StringReader(fileContent)) { diff --git a/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java b/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java index 7b50e7dea..cde1f5405 100644 --- a/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java +++ b/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java @@ -57,14 +57,12 @@ public static InputStreamReader getReaderForInput(InputStream input_) throws IOE * @throws IOException */ public static String readFullContent(InputStreamReader reader_) throws IOException { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - - int b = reader_.read(); - while (b != -1) { - out.write(b); - b = reader_.read(); + StringBuilder out = new StringBuilder(); + char[] buffer = new char[4096]; + int read; + while ((read = reader_.read(buffer)) != -1) { + out.append(buffer, 0, read); } - - return out.toString(reader_.getEncoding()); + return out.toString(); } } diff --git a/plugins/search/indexer/src/indexerprocess.cpp b/plugins/search/indexer/src/indexerprocess.cpp index c2064f62d..1559720c3 100644 --- a/plugins/search/indexer/src/indexerprocess.cpp +++ b/plugins/search/indexer/src/indexerprocess.cpp @@ -59,6 +59,7 @@ IndexerProcess::IndexerProcess( std::vector execArguments { "java", JAVAMEMORYAMOUNT, + "-Dfile.encoding=UTF-8", "-classpath", classpath.c_str(), "-Djava.util.logging.config.class=cc.search.common.config.LogConfigurator", "-Djava.util.logging.SimpleFormatter.format=%1$tY-%1$tm-%1$td %1$tT [%4$s] %5$s%6$s%n", diff --git a/plugins/search/service/include/service/serviceprocess.h b/plugins/search/service/include/service/serviceprocess.h index 319788aba..6bf5451b9 100644 --- a/plugins/search/service/include/service/serviceprocess.h +++ b/plugins/search/service/include/service/serviceprocess.h @@ -61,6 +61,7 @@ class ServiceProcess : public SearchServiceIf, public util::PipedProcess std::string classpath = compassRoot_ + "/lib/java/*"; ::execlp("java", "java", "-server", + "-Dfile.encoding=UTF-8", "-classpath", classpath.c_str(), //"-Xdebug", "-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8666", "-Djava.util.logging.config.class=cc.search.common.config.LogConfigurator", From 3e0daba0d9642c4c0ae51d8b229d186baaeabff5 Mon Sep 17 00:00:00 2001 From: LakatosMark Date: Mon, 25 May 2026 19:15:59 +0200 Subject: [PATCH 2/2] Fix UTF-8 fallback and maintain BOM auto-detection in IOHelper --- .../indexer/indexer-java/src/cc/search/indexer/Context.java | 4 +--- .../indexer-java/src/cc/search/indexer/util/IOHelper.java | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java b/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java index 6bdaf5ee1..55558d61d 100644 --- a/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java +++ b/plugins/search/indexer/indexer-java/src/cc/search/indexer/Context.java @@ -14,8 +14,6 @@ import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; /** * Analysis context. @@ -62,7 +60,7 @@ public Context(String fileId_, File file_, String fileMimeType_) // Read content from a file stream try (FileInputStream stream = new FileInputStream(file_)) { String fileContent = IOHelper.readFullContent( - new InputStreamReader(stream, StandardCharsets.UTF_8)); + IOHelper.getReaderForInput(stream)); // Get line informations try (Reader reader = new StringReader(fileContent)) { diff --git a/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java b/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java index cde1f5405..085cc1fc1 100644 --- a/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java +++ b/plugins/search/indexer/indexer-java/src/cc/search/indexer/util/IOHelper.java @@ -44,7 +44,7 @@ public static InputStreamReader getReaderForInput(InputStream input_) throws IOE if (charset == null) { in.reset(); - charset = Charset.defaultCharset().name(); + charset = "UTF-8"; } return new InputStreamReader(in, charset);