Skip to content

Commit 0b3b83a

Browse files
allensu05 authored and FrankYFTang committed
ICU-22100 Improve Japanese phrase breaking performance
See #2287
1 parent 76df897 commit 0b3b83a

File tree

1 file changed

+45
-87
lines changed

1 file changed

+45
-87
lines changed

icu4c/source/common/mlbe.cpp

+45-87
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,6 @@ MlBreakEngine::~MlBreakEngine() {}
3434

3535
namespace {
3636
const char16_t INVALID = u'|';
37-
const int32_t MAX_FEATURE = 13;
38-
const int32_t MAX_FEATURE_LENGTH = 11;
39-
40-
void concatChar(const char16_t *str, const UChar32 *arr, int32_t length, char16_t *feature, UErrorCode &status) {
41-
if (U_FAILURE(status)) {
42-
return;
43-
}
44-
UnicodeString result(str);
45-
for (int i = 0; i < length; i++) {
46-
result.append(arr[i]);
47-
}
48-
U_ASSERT(result.length() < MAX_FEATURE_LENGTH);
49-
result.extract(feature, MAX_FEATURE_LENGTH, status); // NUL-terminates
50-
}
5137
}
5238

5339
int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t rangeEnd,
@@ -144,96 +130,68 @@ int32_t MlBreakEngine::divideUpRange(UText *inText, int32_t rangeStart, int32_t
144130

145131
void MlBreakEngine::evaluateBreakpoint(UChar32* elementList, int32_t index, int32_t &numBreaks,
146132
UVector32 &boundary, UErrorCode &status) const {
147-
char16_t featureList[MAX_FEATURE][MAX_FEATURE_LENGTH];
148133
if (U_FAILURE(status)) {
149134
return;
150135
}
151136

152-
UChar32 arr[4] = {-1, -1, -1, -1};
153-
int32_t length = 0, listLength = 0;
154-
155-
const UChar32 w1 = elementList[0];
156-
const UChar32 w2 = elementList[1];
157-
const UChar32 w3 = elementList[2];
158-
const UChar32 w4 = elementList[3];
159-
const UChar32 w5 = elementList[4];
160-
const UChar32 w6 = elementList[5];
137+
UnicodeString feature;
138+
int32_t score = fNegativeSum;
161139

162-
length = 1;
163-
if (w1 != INVALID) {
164-
arr[0] = w1;
165-
concatChar(u"UW1:", arr, length, featureList[listLength++], status);
140+
if (elementList[0] != INVALID) {
141+
// When the key doesn't exist, Hashtable.geti(key) returns 0 and 2 * 0 = 0.
142+
// So, we can skip checking whether fModel contains the feature key.
143+
score += (2 * fModel.geti(feature.setTo(u"UW1:", 4).append(elementList[0])));
166144
}
167-
if (w2 != INVALID) {
168-
arr[0] = w2;
169-
concatChar(u"UW2:", arr, length, featureList[listLength++], status);
145+
if (elementList[1] != INVALID) {
146+
score += (2 * fModel.geti(feature.setTo(u"UW2:", 4).append(elementList[1])));
170147
}
171-
if (w3 != INVALID) {
172-
arr[0] = w3;
173-
concatChar(u"UW3:", arr, length, featureList[listLength++], status);
148+
if (elementList[2] != INVALID) {
149+
score += (2 * fModel.geti(feature.setTo(u"UW3:", 4).append(elementList[2])));
174150
}
175-
if (w4 != INVALID) {
176-
arr[0] = w4;
177-
concatChar(u"UW4:", arr, length, featureList[listLength++], status);
151+
if (elementList[3] != INVALID) {
152+
score += (2 * fModel.geti(feature.setTo(u"UW4:", 4).append(elementList[3])));
178153
}
179-
if (w5 != INVALID) {
180-
arr[0] = w5;
181-
concatChar(u"UW5:", arr, length, featureList[listLength++], status);
154+
if (elementList[4] != INVALID) {
155+
score += (2 * fModel.geti(feature.setTo(u"UW5:", 4).append(elementList[4])));
182156
}
183-
if (w6 != INVALID) {
184-
arr[0] = w6;
185-
concatChar(u"UW6:", arr, length, featureList[listLength++], status);
157+
if (elementList[5] != INVALID) {
158+
score += (2 * fModel.geti(feature.setTo(u"UW6:", 4).append(elementList[5])));
186159
}
187-
length = 2;
188-
if (w2 != INVALID && w3 != INVALID) {
189-
arr[0] = w2;
190-
arr[1] = w3;
191-
concatChar(u"BW1:", arr, length, featureList[listLength++], status);
160+
if (elementList[1] != INVALID && elementList[2] != INVALID) {
161+
score += (2 * fModel.geti(
162+
feature.setTo(u"BW1:", 4).append(elementList[1]).append(elementList[2])));
192163
}
193-
if (w3 != INVALID && w4 != INVALID) {
194-
arr[0] = w3;
195-
arr[1] = w4;
196-
concatChar(u"BW2:", arr, length, featureList[listLength++], status);
164+
if (elementList[2] != INVALID && elementList[3] != INVALID) {
165+
score += (2 * fModel.geti(
166+
feature.setTo(u"BW2:", 4).append(elementList[2]).append(elementList[3])));
197167
}
198-
if (w4 != INVALID && w5 != INVALID) {
199-
arr[0] = w4;
200-
arr[1] = w5;
201-
concatChar(u"BW3:", arr, length, featureList[listLength++], status);
168+
if (elementList[3] != INVALID && elementList[4] != INVALID) {
169+
score += (2 * fModel.geti(
170+
feature.setTo(u"BW3:", 4).append(elementList[3]).append(elementList[4])));
202171
}
203-
length = 3;
204-
if (w1 != INVALID && w2 != INVALID && w3 != INVALID) {
205-
arr[0] = w1;
206-
arr[1] = w2;
207-
arr[2] = w3;
208-
concatChar(u"TW1:", arr, length, featureList[listLength++], status);
172+
if (elementList[0] != INVALID && elementList[1] != INVALID && elementList[2] != INVALID) {
173+
score += (2 * fModel.geti(feature.setTo(u"TW1:", 4)
174+
.append(elementList[0])
175+
.append(elementList[1])
176+
.append(elementList[2])));
209177
}
210-
if (w2 != INVALID && w3 != INVALID && w4 != INVALID) {
211-
arr[0] = w2;
212-
arr[1] = w3;
213-
arr[2] = w4;
214-
concatChar(u"TW2:", arr, length, featureList[listLength++], status);
178+
if (elementList[1] != INVALID && elementList[2] != INVALID && elementList[3] != INVALID) {
179+
score += (2 * fModel.geti(feature.setTo(u"TW2:", 4)
180+
.append(elementList[1])
181+
.append(elementList[2])
182+
.append(elementList[3])));
215183
}
216-
if (w3 != INVALID && w4 != INVALID && w5 != INVALID) {
217-
arr[0] = w3;
218-
arr[1] = w4;
219-
arr[2] = w5;
220-
concatChar(u"TW3:", arr, length, featureList[listLength++], status);
184+
if (elementList[2] != INVALID && elementList[3] != INVALID && elementList[4] != INVALID) {
185+
score += (2 * fModel.geti(feature.setTo(u"TW3:", 4)
186+
.append(elementList[2])
187+
.append(elementList[3])
188+
.append(elementList[4])));
221189
}
222-
if (w4 != INVALID && w5 != INVALID && w6 != INVALID) {
223-
arr[0] = w4;
224-
arr[1] = w5;
225-
arr[2] = w6;
226-
concatChar(u"TW4:", arr, length, featureList[listLength++], status);
227-
}
228-
if (U_FAILURE(status)) {
229-
return;
230-
}
231-
int32_t score = fNegativeSum;
232-
for (int32_t j = 0; j < listLength; j++) {
233-
UnicodeString key(featureList[j]);
234-
if (fModel.containsKey(key)) {
235-
score += (2 * fModel.geti(key));
236-
}
190+
if (elementList[3] != INVALID && elementList[4] != INVALID && elementList[5] != INVALID) {
191+
score += (2 * fModel.geti(feature.setTo(u"TW4:", 4)
192+
.append(elementList[3])
193+
.append(elementList[4])
194+
.append(elementList[5])));
237195
}
238196
if (score > 0) {
239197
boundary.addElement(index, status);

0 commit comments

Comments
 (0)