Skip to content

Commit 2134a2a

Browse files
committed
Change DetectEncoding Api from stream to database.
This reduces the amount of unnecessary database loads. Check the system tab first as this leads to slightly better detection.
1 parent 0cbf95e commit 2134a2a

2 files changed

Lines changed: 91 additions & 94 deletions

File tree

src/lcf/reader_util.h

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#include "lcf/string_view.h"
1616

1717
namespace lcf {
18+
namespace rpg {
19+
class Database;
20+
}
1821

1922
/**
2023
* ReaderUtil namespace.
@@ -29,42 +32,42 @@ namespace ReaderUtil {
2932
std::string CodepageToEncoding(int codepage);
3033

3134
/**
32-
* Detects the encoding based on text analysis.
35+
* Detects the encoding of the database based on text analysis.
3336
*
34-
* @param filestream stream containing the database file
37+
* @param db Database to process
3538
*
3639
* @return encoding or empty string if not detected.
3740
*/
38-
std::string DetectEncoding(std::istream& filestream);
41+
std::string DetectEncoding(lcf::rpg::Database& db);
3942

4043
/**
41-
* Detects the encoding based on text analysis.
44+
* Detects the encoding of the database based on text analysis.
45+
* Returns a vector of possible candidates, with the most confident candidate first.
4246
*
43-
* @param filestream stream containing the database file
47+
* @param db Database to process
4448
*
45-
* @return encoding or empty string if not detected.
49+
* @return list of encodings or empty if not detected
4650
*/
47-
std::string DetectEncoding(StringView data);
51+
std::vector<std::string> DetectEncodings(lcf::rpg::Database& db);
4852

4953
/**
50-
* Detects the encoding based on text analysis and returns a vector with
51-
* possible candidates, highest candidate being at the beginning.
54+
* Detects the encoding of a string based on text analysis.
5255
*
53-
* @param filestream stream containing the database file
56+
* @param data encoded data of a few hundred bytes
5457
*
55-
* @return list of encodings or empty if not detected
58+
* @return encoding or empty string if not detected.
5659
*/
57-
std::vector<std::string> DetectEncodings(std::istream& filestream);
60+
std::string DetectEncoding(StringView data);
5861

5962
/**
60-
* Detects the encoding based on text analysis and returns a vector with
61-
* possible candidates, highest candidate being at the beginning.
63+
* Detects the encoding of a string based on text analysis.
64+
* Returns a vector of possible candidates, with the most confident candidate first.
6265
*
6366
* @param string encoded data of a few hundred bytes
6467
*
6568
* @return list of encodings or empty if not detected
6669
*/
67-
std::vector<std::string> DetectEncodings(StringView data);
70+
std::vector<std::string> DetectEncodings(StringView string);
6871

6972
/**
7073
* Returns the encoding set in the ini file.
@@ -73,7 +76,7 @@ namespace ReaderUtil {
7376
*
7477
* @return encoding or empty string if not found.
7578
*/
76-
std::string GetEncoding(const std::string& ini_file);
79+
std::string GetEncoding(StringView ini_file);
7780

7881
/**
7982
* Returns the encoding set in the ini file.

src/reader_util.cpp

Lines changed: 72 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ std::string ReaderUtil::CodepageToEncoding(int codepage) {
7575
return outs;
7676
}
7777

78-
std::string ReaderUtil::DetectEncoding(std::istream& filestream) {
79-
std::vector<std::string> encodings = DetectEncodings(filestream);
78+
std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
79+
std::vector<std::string> encodings = DetectEncodings(db);
8080

8181
if (encodings.empty()) {
8282
return "";
@@ -85,114 +85,108 @@ std::string ReaderUtil::DetectEncoding(std::istream& filestream) {
8585
return encodings.front();
8686
}
8787

88-
std::string ReaderUtil::DetectEncoding(StringView data) {
89-
std::vector<std::string> encodings = DetectEncodings(data);
90-
91-
if (encodings.empty()) {
92-
return "";
93-
}
94-
95-
return encodings.front();
96-
}
97-
98-
std::vector<std::string> ReaderUtil::DetectEncodings(std::istream& filestream) {
88+
std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
9989
#if LCF_SUPPORT_ICU
100-
// Populate db->terms and db->system or will empty by default even if load fails
101-
auto db = LDB_Reader::Load(filestream, "");
90+
std::ostringstream text;
10291

103-
if (!db) {
104-
return {};
92+
auto append = [](const auto& s) {
93+
return ToString(s) + " ";
94+
};
95+
96+
lcf::rpg::ForEachString(db.system, [&](const auto& val, const auto& ctx) {
97+
text << append(val);
98+
});
99+
100+
// Cannot use ForEachString here for Terms:
101+
// Too much untranslated garbage data in there, even in default database
102+
for (const auto& s: {
103+
db.terms.menu_save,
104+
db.terms.menu_quit,
105+
db.terms.new_game,
106+
db.terms.load_game,
107+
db.terms.exit_game,
108+
db.terms.status,
109+
db.terms.row,
110+
db.terms.order,
111+
db.terms.wait_on,
112+
db.terms.wait_off,
113+
db.terms.level,
114+
db.terms.health_points,
115+
db.terms.spirit_points,
116+
db.terms.normal_status,
117+
db.terms.sp_cost,
118+
db.terms.attack,
119+
db.terms.defense,
120+
db.terms.spirit,
121+
db.terms.agility,
122+
db.terms.weapon,
123+
db.terms.shield,
124+
db.terms.armor,
125+
db.terms.helmet,
126+
db.terms.accessory,
127+
db.terms.save_game_message,
128+
db.terms.load_game_message,
129+
db.terms.exit_game_message,
130+
db.terms.file,
131+
db.terms.yes,
132+
db.terms.no
133+
}) {
134+
text << append(s);
105135
}
106136

107-
std::ostringstream text;
108-
text <<
109-
db->terms.menu_save <<
110-
db->terms.menu_quit <<
111-
db->terms.new_game <<
112-
db->terms.load_game <<
113-
db->terms.exit_game <<
114-
db->terms.status <<
115-
db->terms.row <<
116-
db->terms.order <<
117-
db->terms.wait_on <<
118-
db->terms.wait_off <<
119-
db->terms.level <<
120-
db->terms.health_points <<
121-
db->terms.spirit_points <<
122-
db->terms.normal_status <<
123-
db->terms.exp_short <<
124-
db->terms.lvl_short <<
125-
db->terms.hp_short <<
126-
db->terms.sp_short <<
127-
db->terms.sp_cost <<
128-
db->terms.attack <<
129-
db->terms.defense <<
130-
db->terms.spirit <<
131-
db->terms.agility <<
132-
db->terms.weapon <<
133-
db->terms.shield <<
134-
db->terms.armor <<
135-
db->terms.helmet <<
136-
db->terms.accessory <<
137-
db->terms.save_game_message <<
138-
db->terms.load_game_message <<
139-
db->terms.file <<
140-
db->terms.exit_game_message <<
141-
db->terms.yes <<
142-
db->terms.no <<
143-
db->system.boat_name <<
144-
db->system.ship_name <<
145-
db->system.airship_name <<
146-
db->system.title_name <<
147-
db->system.gameover_name <<
148-
db->system.system_name <<
149-
db->system.system2_name <<
150-
db->system.battletest_background <<
151-
db->system.frame_name;
152-
153137
return ReaderUtil::DetectEncodings(text.str());
154138
#else
155139
return std::vector<std::string>();
156140
#endif
157141
}
158142

159-
std::vector<std::string> ReaderUtil::DetectEncodings(StringView data) {
143+
std::string ReaderUtil::DetectEncoding(StringView string) {
144+
std::vector<std::string> encodings = DetectEncodings(string);
145+
146+
if (encodings.empty()) {
147+
return "";
148+
}
149+
150+
return encodings.front();
151+
}
152+
153+
std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
160154
std::vector<std::string> encodings;
161155
#if LCF_SUPPORT_ICU
162-
if (!data.empty()) {
156+
if (!string.empty()) {
163157
UErrorCode status = U_ZERO_ERROR;
164158
UCharsetDetector* detector = ucsdet_open(&status);
165159

166-
auto s = std::string(data);
160+
auto s = std::string(string);
167161
ucsdet_setText(detector, s.c_str(), s.length(), &status);
168162

169163
int32_t matches_count;
170164
const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
171165

172-
if (matches != NULL) {
166+
if (matches != nullptr) {
173167
// Collect all candidates, most confident comes first
174168
for (int i = 0; i < matches_count; ++i) {
175169
std::string encoding = ucsdet_getName(matches[i], &status);
176170

177171
// Fixes to ensure proper Windows encodings
178172
if (encoding == "Shift_JIS") {
179-
encodings.push_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
173+
encodings.emplace_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
180174
} else if (encoding == "EUC-KR") {
181-
encodings.push_back("windows-949-2000"); // Korean with \ as backlash
175+
encodings.emplace_back("windows-949-2000"); // Korean with \ as backslash
182176
} else if (encoding == "GB18030") {
183-
encodings.push_back("windows-936-2000"); // Simplified Chinese
177+
encodings.emplace_back("windows-936-2000"); // Simplified Chinese
184178
} else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
185-
encodings.push_back("ibm-5348_P100-1997"); // Occidental with Euro
179+
encodings.emplace_back("ibm-5348_P100-1997"); // Occidental with Euro
186180
} else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
187-
encodings.push_back("ibm-5346_P100-1998"); // Central Europe with Euro
181+
encodings.emplace_back("ibm-5346_P100-1998"); // Central Europe with Euro
188182
} else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
189-
encodings.push_back("ibm-5347_P100-1998"); // Cyrillic with Euro
183+
encodings.emplace_back("ibm-5347_P100-1998"); // Cyrillic with Euro
190184
} else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
191-
encodings.push_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
185+
encodings.emplace_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
192186
} else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
193-
encodings.push_back("ibm-5349_P100-1998"); // Greek with Euro
187+
encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
194188
} else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
195-
encodings.push_back("ibm-9447_P100-2002"); // Hebrew with Euro
189+
encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
196190
} else {
197191
encodings.push_back(encoding);
198192
}
@@ -205,8 +199,8 @@ std::vector<std::string> encodings;
205199
return encodings;
206200
}
207201

208-
std::string ReaderUtil::GetEncoding(const std::string& ini_file) {
209-
INIReader ini(ini_file);
202+
std::string ReaderUtil::GetEncoding(StringView ini_file) {
203+
INIReader ini(ToString(ini_file));
210204
if (ini.ParseError() != -1) {
211205
std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
212206
if (!encoding.empty()) {

0 commit comments

Comments
 (0)