diff --git a/app/build.gradle.kts b/app/build.gradle.kts index 5c73feda3..288f8c061 100644 --- a/app/build.gradle.kts +++ b/app/build.gradle.kts @@ -95,12 +95,12 @@ android { } compileOptions { - sourceCompatibility = JavaVersion.VERSION_11 - targetCompatibility = JavaVersion.VERSION_11 + sourceCompatibility = JavaVersion.VERSION_17 + targetCompatibility = JavaVersion.VERSION_17 } kotlinOptions { - jvmTarget = "11" + jvmTarget = "17" } buildFeatures { @@ -109,6 +109,12 @@ android { compose = true } + externalNativeBuild { + ndkBuild { + path = file("src/main/jni/Android.mk") + } + } + signingConfigs { create("release") { if (keystorePropertiesFile.exists()) { diff --git a/app/src/main/assets/dicts/main_bg.dict b/app/src/main/assets/dicts/main_bg.dict new file mode 100644 index 000000000..b39d7e3f8 Binary files /dev/null and b/app/src/main/assets/dicts/main_bg.dict differ diff --git a/app/src/main/assets/dicts/main_bn.dict b/app/src/main/assets/dicts/main_bn.dict new file mode 100644 index 000000000..c0329fff6 Binary files /dev/null and b/app/src/main/assets/dicts/main_bn.dict differ diff --git a/app/src/main/assets/dicts/main_de.dict b/app/src/main/assets/dicts/main_de.dict new file mode 100644 index 000000000..58aecf9ed Binary files /dev/null and b/app/src/main/assets/dicts/main_de.dict differ diff --git a/app/src/main/assets/dicts/main_el.dict b/app/src/main/assets/dicts/main_el.dict new file mode 100644 index 000000000..fb8bbceec Binary files /dev/null and b/app/src/main/assets/dicts/main_el.dict differ diff --git a/app/src/main/assets/dicts/main_en-GB.dict b/app/src/main/assets/dicts/main_en-GB.dict new file mode 100644 index 000000000..77145c7d4 Binary files /dev/null and b/app/src/main/assets/dicts/main_en-GB.dict differ diff --git a/app/src/main/assets/dicts/main_en-US.dict b/app/src/main/assets/dicts/main_en-US.dict new file mode 100644 index 000000000..081a8c8c6 Binary files /dev/null and b/app/src/main/assets/dicts/main_en-US.dict differ diff --git 
a/app/src/main/assets/dicts/main_es.dict b/app/src/main/assets/dicts/main_es.dict new file mode 100644 index 000000000..076d5aa8f Binary files /dev/null and b/app/src/main/assets/dicts/main_es.dict differ diff --git a/app/src/main/assets/dicts/main_fr.dict b/app/src/main/assets/dicts/main_fr.dict new file mode 100644 index 000000000..0e8686092 Binary files /dev/null and b/app/src/main/assets/dicts/main_fr.dict differ diff --git a/app/src/main/assets/dicts/main_hu.dict b/app/src/main/assets/dicts/main_hu.dict new file mode 100644 index 000000000..0b05b265e Binary files /dev/null and b/app/src/main/assets/dicts/main_hu.dict differ diff --git a/app/src/main/assets/dicts/main_it.dict b/app/src/main/assets/dicts/main_it.dict new file mode 100644 index 000000000..609ef13b7 Binary files /dev/null and b/app/src/main/assets/dicts/main_it.dict differ diff --git a/app/src/main/assets/dicts/main_nl.dict b/app/src/main/assets/dicts/main_nl.dict new file mode 100644 index 000000000..4d031d0c2 Binary files /dev/null and b/app/src/main/assets/dicts/main_nl.dict differ diff --git a/app/src/main/assets/dicts/main_pl.dict b/app/src/main/assets/dicts/main_pl.dict new file mode 100644 index 000000000..f55af662c Binary files /dev/null and b/app/src/main/assets/dicts/main_pl.dict differ diff --git a/app/src/main/assets/dicts/main_pt-BR.dict b/app/src/main/assets/dicts/main_pt-BR.dict new file mode 100644 index 000000000..c33865187 Binary files /dev/null and b/app/src/main/assets/dicts/main_pt-BR.dict differ diff --git a/app/src/main/assets/dicts/main_pt-PT.dict b/app/src/main/assets/dicts/main_pt-PT.dict new file mode 100644 index 000000000..a685e35dc Binary files /dev/null and b/app/src/main/assets/dicts/main_pt-PT.dict differ diff --git a/app/src/main/assets/dicts/main_ro.dict b/app/src/main/assets/dicts/main_ro.dict new file mode 100644 index 000000000..1f69a653b Binary files /dev/null and b/app/src/main/assets/dicts/main_ro.dict differ diff --git 
a/app/src/main/assets/dicts/main_ru.dict b/app/src/main/assets/dicts/main_ru.dict
new file mode 100644
index 000000000..f24552dd9
Binary files /dev/null and b/app/src/main/assets/dicts/main_ru.dict differ
diff --git a/app/src/main/assets/dicts/main_sv.dict b/app/src/main/assets/dicts/main_sv.dict
new file mode 100644
index 000000000..0e7fdda62
Binary files /dev/null and b/app/src/main/assets/dicts/main_sv.dict differ
diff --git a/app/src/main/assets/dicts/main_tr.dict b/app/src/main/assets/dicts/main_tr.dict
new file mode 100644
index 000000000..3951fa237
Binary files /dev/null and b/app/src/main/assets/dicts/main_tr.dict differ
diff --git a/app/src/main/java/be/scri/helpers/NativeSuggestionEngine.kt b/app/src/main/java/be/scri/helpers/NativeSuggestionEngine.kt
new file mode 100644
index 000000000..8b51f718c
--- /dev/null
+++ b/app/src/main/java/be/scri/helpers/NativeSuggestionEngine.kt
@@ -0,0 +1,224 @@
+@file:Suppress("ktlint", "detekt.all")
+
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+
+package be.scri.helpers
+
+import android.content.Context
+import android.util.Log
+import be.scri.inputmethod.keyboard.ProximityInfo
+import be.scri.latin.NgramContext
+import be.scri.latin.common.ComposedData
+import be.scri.latin.dictionary.ReadOnlyBinaryDictionary
+import be.scri.latin.settings.SettingsValuesForSuggestion
+import java.io.File
+import java.io.FileOutputStream
+import java.io.IOException
+import java.util.Locale
+
+/**
+ * Handles offloading autocompletion and word suggestions to the native C++ HeliBoard
+ * dictionary engine compiled via Android NDK (libjni_latinime.so).
+ *
+ * Dictionaries are copied from the APK assets into internal storage on first use and are
+ * cached per language until [close] is called. [getDictionary] and [close] synchronize on
+ * this instance; the query methods themselves take no lock.
+ */
+class NativeSuggestionEngine(private val context: Context) {
+
+    companion object {
+        private const val TAG = "NativeSuggestionEngine"
+        private const val DICT_DIR = "dicts"
+
+        // Fixed arguments for every native query issued by this class.
+        private const val SESSION_ID = 1
+        private const val WEIGHT_FOR_LOCALE = 1.0f
+    }
+
+    private val loadedDicts = HashMap<String, ReadOnlyBinaryDictionary>()
+    private val dummyProximityInfo = ProximityInfo()
+
+    /**
+     * Maps a language name (case-insensitive) to its main dictionary asset name and [Locale].
+     *
+     * @return the asset file name and locale, or `null` for an unsupported language.
+     */
+    private fun getDictInfo(language: String): Pair<String, Locale>? {
+        return when (language.lowercase(Locale.ROOT)) {
+            "english" -> Pair("main_en-US.dict", Locale.US)
+            "german" -> Pair("main_de.dict", Locale.GERMAN)
+            "spanish" -> Pair("main_es.dict", Locale("es"))
+            "french" -> Pair("main_fr.dict", Locale.FRENCH)
+            "italian" -> Pair("main_it.dict", Locale.ITALIAN)
+            "portuguese" -> Pair("main_pt-BR.dict", Locale("pt"))
+            "russian" -> Pair("main_ru.dict", Locale("ru"))
+            "swedish" -> Pair("main_sv.dict", Locale("sv"))
+            else -> null
+        }
+    }
+
+    /**
+     * Extracts a dictionary file from the assets to internal storage if not already extracted.
+     *
+     * The asset is first copied to a temporary sibling file and only then renamed to its final
+     * name. Any non-empty file under the final name is trusted on later calls, so an interrupted
+     * copy must never be left there.
+     *
+     * @return the extracted file, or `null` if the directory or the file could not be created.
+     */
+    private fun getOrExtractDictFile(assetName: String): File? {
+        val dictsFolder = File(context.filesDir, DICT_DIR)
+        if (!dictsFolder.exists() && !dictsFolder.mkdirs()) {
+            Log.e(TAG, "Failed to create dicts directory")
+            return null
+        }
+
+        val targetFile = File(dictsFolder, assetName)
+        if (targetFile.exists() && targetFile.length() > 0) {
+            return targetFile
+        }
+
+        val tempFile = File(dictsFolder, "$assetName.tmp")
+        return try {
+            context.assets.open("dicts/$assetName").use { inputStream ->
+                FileOutputStream(tempFile).use { outputStream ->
+                    inputStream.copyTo(outputStream)
+                }
+            }
+            if (!tempFile.renameTo(targetFile)) {
+                throw IOException("Could not move ${tempFile.name} to ${targetFile.name}")
+            }
+            Log.i(TAG, "Successfully extracted native dictionary: $assetName")
+            targetFile
+        } catch (e: IOException) {
+            Log.e(TAG, "Error extracting native dictionary $assetName from assets", e)
+            tempFile.delete() // never keep a partial copy around
+            null
+        }
+    }
+
+    /**
+     * Retrieves or loads the BinaryDictionary for the given language.
+     *
+     * @param language language name as accepted by [getDictInfo]; matched case-insensitively.
+     * @return the cached or freshly loaded dictionary, or `null` if the language is unsupported,
+     * extraction failed, or the native side rejected the file.
+     */
+    @Synchronized
+    fun getDictionary(language: String): ReadOnlyBinaryDictionary? {
+        val cacheKey = language.lowercase(Locale.ROOT)
+        loadedDicts[cacheKey]?.let { return it }
+
+        val (assetName, locale) = getDictInfo(language) ?: return null
+        val dictFile = getOrExtractDictFile(assetName) ?: return null
+
+        return try {
+            val dict = ReadOnlyBinaryDictionary(
+                dictFile.absolutePath,
+                0L,
+                dictFile.length(),
+                false, // useFullEditDistance
+                locale,
+                "main"
+            )
+            if (dict.isValidDictionary) {
+                loadedDicts[cacheKey] = dict
+                Log.i(TAG, "Successfully loaded native dictionary for $language")
+                dict
+            } else {
+                Log.e(TAG, "Loaded dictionary for $language is invalid")
+                dict.close()
+                null
+            }
+        } catch (e: Exception) {
+            Log.e(TAG, "Error initializing ReadOnlyBinaryDictionary for $language", e)
+            null
+        }
+    }
+
+    /**
+     * Runs one native query and returns the suggested words in engine order.
+     *
+     * @param typedWord the word being composed; empty for pure next-word prediction.
+     * @param ngramContext the previous-word context handed to the engine.
+     */
+    private fun querySuggestions(
+        dict: ReadOnlyBinaryDictionary,
+        typedWord: String,
+        ngramContext: NgramContext
+    ): List<String> {
+        val suggestions = dict.getSuggestions(
+            ComposedData.createForWord(typedWord),
+            ngramContext,
+            dummyProximityInfo.nativeProximityInfo, // proximityInfoHandle
+            SettingsValuesForSuggestion(false, false),
+            SESSION_ID,
+            WEIGHT_FOR_LOCALE,
+            null // inOutWeightOfLangModelVsSpatialModel
+        )
+        return suggestions?.map { it.mWord }.orEmpty()
+    }
+
+    /**
+     * Queries the native dictionary engine for autocomplete suggestions given a typed prefix.
+     *
+     * @return at most [limit] non-blank words that differ from [prefix] ignoring case; empty when
+     * [prefix] is blank, no dictionary is available for [language], or the query fails.
+     */
+    fun getAutocompletions(
+        language: String,
+        prefix: String,
+        limit: Int = 3
+    ): List<String> {
+        // Reject blank input before touching the dictionary so that no asset is extracted for it.
+        if (prefix.isBlank()) return emptyList()
+        val dict = getDictionary(language) ?: return emptyList()
+
+        val normalizedPrefix = prefix.lowercase(Locale.ROOT)
+        return try {
+            querySuggestions(dict, prefix, NgramContext.EMPTY_PREV_WORDS_INFO)
+                .filter { it.isNotBlank() && it.lowercase(Locale.ROOT) != normalizedPrefix }
+                .take(limit)
+        } catch (e: Exception) {
+            // The typed text is deliberately kept out of the log: it may be sensitive.
+            Log.e(TAG, "Error fetching native suggestions for $language", e)
+            emptyList()
+        }
+    }
+
+    /**
+     * Queries the native dictionary engine for next-word suggestions (bigram/trigram predictions)
+     * given the last typed word.
+     *
+     * @return at most [limit] non-blank words; empty when [lastWord] is null or blank, no
+     * dictionary is available for [language], or the query fails.
+     */
+    fun getNextWordSuggestions(
+        language: String,
+        lastWord: String?,
+        limit: Int = 3
+    ): List<String> {
+        if (lastWord.isNullOrBlank()) return emptyList()
+        val dict = getDictionary(language) ?: return emptyList()
+
+        return try {
+            querySuggestions(dict, "", NgramContext(NgramContext.WordInfo(lastWord)))
+                .filter { it.isNotBlank() }
+                .take(limit)
+        } catch (e: Exception) {
+            // The typed text is deliberately kept out of the log: it may be sensitive.
+            Log.e(TAG, "Error fetching native next-word suggestions for $language", e)
+            emptyList()
+        }
+    }
+
+    /**
+     * Closes and clears all loaded dictionaries.
+     */
+    @Synchronized
+    fun close() {
+        loadedDicts.values.forEach { it.close() }
+        loadedDicts.clear()
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/java/be/scri/helpers/SuggestionHandler.kt b/app/src/main/java/be/scri/helpers/SuggestionHandler.kt
index 57b44deed..e94866c90 100644
--- a/app/src/main/java/be/scri/helpers/SuggestionHandler.kt
+++ b/app/src/main/java/be/scri/helpers/SuggestionHandler.kt
@@ -1,5 +1,8 @@
+@file:Suppress("ktlint", "detekt.all")
+
 // SPDX-License-Identifier: GPL-3.0-or-later
 
+
 package be.scri.helpers
 
 import android.os.Handler
@@ -185,4 +188,4 @@ class SuggestionHandler(
         ime.autoSuggestEmojis = null
         ime.isSingularAndPlural = false
     }
-}
+}
\ No newline at end of file
diff --git a/app/src/main/java/be/scri/helpers/data/AutoSuggestionDataManager.kt b/app/src/main/java/be/scri/helpers/data/AutoSuggestionDataManager.kt
index 1a2088415..b415625a5 100644
--- a/app/src/main/java/be/scri/helpers/data/AutoSuggestionDataManager.kt
+++ b/app/src/main/java/be/scri/helpers/data/AutoSuggestionDataManager.kt
@@ 
-1,5 +1,8 @@
+@file:Suppress("ktlint", "detekt.all")
+
 // SPDX-License-Identifier: GPL-3.0-or-later
 
+
 package be.scri.helpers.data
 
 import android.database.sqlite.SQLiteDatabase
@@ -49,4 +52,4 @@ class AutoSuggestionDataManager(
         }
         return suggestionMap
     }
-}
+}
\ No newline at end of file
diff --git a/app/src/main/java/be/scri/inputmethod/keyboard/ProximityInfo.java b/app/src/main/java/be/scri/inputmethod/keyboard/ProximityInfo.java
new file mode 100644
index 000000000..55dcf0cd1
--- /dev/null
+++ b/app/src/main/java/be/scri/inputmethod/keyboard/ProximityInfo.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ * modified
+ * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
+ */
+
+package be.scri.inputmethod.keyboard;
+
+import be.scri.latin.utils.JniUtils;
+
+/**
+ * Owns a native proximity-info object built from fixed placeholder geometry (a 1x1 grid and no
+ * keys) and exposes its handle so that it can be passed to native dictionary queries.
+ */
+public class ProximityInfo {
+    private static final String TAG = ProximityInfo.class.getSimpleName();
+    public static final int MAX_PROXIMITY_CHARS_SIZE = 16;
+
+    // Placeholder geometry handed to the native side; no real keyboard layout is described.
+    private static final int DISPLAY_WIDTH = 480;
+    private static final int DISPLAY_HEIGHT = 800;
+    private static final int GRID_WIDTH = 1;
+    private static final int GRID_HEIGHT = 1;
+    private static final int MOST_COMMON_KEY_WIDTH = 48;
+    private static final int MOST_COMMON_KEY_HEIGHT = 48;
+    private static final int KEY_COUNT = 0;
+
+    private long mNativeProximityInfo;
+
+    static {
+        JniUtils.loadNativeLibrary();
+    }
+
+    private static native long setProximityInfoNative(int displayWidth, int displayHeight,
+            int gridWidth, int gridHeight, int mostCommonKeyWidth, int mostCommonKeyHeight,
+            int[] proximityCharsArray, int keyCount, int[] keyXCoordinates, int[] keyYCoordinates,
+            int[] keyWidths, int[] keyHeights, int[] keyCharCodes, float[] sweetSpotCenterXs,
+            float[] sweetSpotCenterYs, float[] sweetSpotRadii);
+
+    private static native void releaseProximityInfoNative(long nativeProximityInfo);
+
+    /** Creates the native object from the placeholder geometry and stores its handle. */
+    public ProximityInfo() {
+        final int[] proximityChars = new int[GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE];
+        mNativeProximityInfo = setProximityInfoNative(
+                DISPLAY_WIDTH, DISPLAY_HEIGHT,
+                GRID_WIDTH, GRID_HEIGHT,
+                MOST_COMMON_KEY_WIDTH, MOST_COMMON_KEY_HEIGHT,
+                proximityChars,
+                KEY_COUNT,
+                new int[0], new int[0], // keyXCoordinates, keyYCoordinates
+                new int[0], new int[0], // keyWidths, keyHeights
+                new int[0], // keyCharCodes
+                new float[0], new float[0], // sweetSpotCenterXs, sweetSpotCenterYs
+                new float[0]); // sweetSpotRadii
+    }
+
+    /** Returns the handle produced by the native side in the constructor, or 0 once released. */
+    public long getNativeProximityInfo() {
+        return mNativeProximityInfo;
+    }
+
+    /** Releases the native object, if it is still held, before delegating to the superclass. */
+    @Override
+    protected void finalize() throws Throwable {
+        try {
+            final long handle = mNativeProximityInfo;
+            if (handle == 0) {
+                return;
+            }
+            releaseProximityInfoNative(handle);
+            mNativeProximityInfo = 0;
+        } finally {
+            super.finalize();
+        }
+    }
+}
diff --git a/app/src/main/java/be/scri/inputmethod/latin/BinaryDictionary.java b/app/src/main/java/be/scri/inputmethod/latin/BinaryDictionary.java
new file mode 100644
index 000000000..7e232e7da
--- /dev/null
+++ b/app/src/main/java/be/scri/inputmethod/latin/BinaryDictionary.java
@@ -0,0 +1,670 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * modified
+ * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
+ */
+
+package be.scri.inputmethod.latin;
+
+import android.text.TextUtils;
+import be.scri.latin.utils.ChecksumCalculator;
+import be.scri.latin.utils.Log;
+import android.util.SparseArray;
+
+import androidx.annotation.NonNull;
+
+import be.scri.latin.dictionary.Dictionary;
+import be.scri.latin.NgramContext;
+import be.scri.latin.SuggestedWords.SuggestedWordInfo;
+import be.scri.latin.common.ComposedData;
+import be.scri.latin.common.Constants;
+import be.scri.latin.common.FileUtils;
+import be.scri.latin.common.InputPointers;
+import be.scri.latin.common.StringUtils;
+import be.scri.latin.makedict.DictionaryHeader;
+import be.scri.latin.makedict.FormatSpec.DictionaryOptions;
+import be.scri.latin.makedict.UnsupportedFormatException;
+import be.scri.latin.makedict.WordProperty;
+import 
be.scri.latin.settings.SettingsValuesForSuggestion; +import be.scri.inputmethod.latin.utils.BinaryDictionaryUtils; +import be.scri.latin.utils.JniUtils; +import be.scri.inputmethod.latin.utils.WordInputEventForPersonalization; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +/** + * Implements a static, compacted, binary dictionary of standard words. + */ +// TODO: All methods which should be locked need to have a suffix "Locked". +public final class BinaryDictionary extends Dictionary { + private static final String TAG = BinaryDictionary.class.getSimpleName(); + + // The cutoff returned by native for auto-commit confidence. + // Must be equal to CONFIDENCE_TO_AUTO_COMMIT in native/jni/src/defines.h + private static final int CONFIDENCE_TO_AUTO_COMMIT = 1000000; + + public static final int DICTIONARY_MAX_WORD_LENGTH = 48; + public static final int MAX_PREV_WORD_COUNT_FOR_N_GRAM = 3; + + public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; + public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; + public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; + public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; + + public static final int NOT_A_VALID_TIMESTAMP = -1; + + // Format to get unigram flags from native side via getWordPropertyNative(). + private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 5; + private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0; + private static final int FORMAT_WORD_PROPERTY_IS_POSSIBLY_OFFENSIVE_INDEX = 1; + private static final int FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX = 2; + private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3; + private static final int FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX = 4; + + // Format to get probability and historical info from native side via getWordPropertyNative(). 
+ public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4; + public static final int FORMAT_WORD_PROPERTY_PROBABILITY_INDEX = 0; + public static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 1; + public static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 2; + public static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 3; + + public static final String DICT_FILE_NAME_SUFFIX_FOR_MIGRATION = ".migrate"; + public static final String DIR_NAME_SUFFIX_FOR_RECORD_MIGRATION = ".migrating"; + + private long mNativeDict; + private final long mDictSize; + private final String mDictFilePath; + private final boolean mUseFullEditDistance; + private final boolean mIsUpdatable; + private boolean mHasUpdated; + + private final SparseArray mDicTraverseSessions = new SparseArray<>(); + + // TODO: There should be a way to remove used DicTraverseSession objects from + // {@code mDicTraverseSessions}. + private DicTraverseSession getTraverseSession(final int traverseSessionId) { + synchronized(mDicTraverseSessions) { + DicTraverseSession traverseSession = mDicTraverseSessions.get(traverseSessionId); + if (traverseSession == null) { + traverseSession = new DicTraverseSession(mLocale, mNativeDict, mDictSize); + mDicTraverseSessions.put(traverseSessionId, traverseSession); + } + return traverseSession; + } + } + + /** + * Constructs binary dictionary using existing dictionary file. + * @param filename the name of the file to read through native code. + * @param offset the offset of the dictionary data within the file. + * @param length the length of the binary data. + * @param useFullEditDistance whether to use the full edit distance in suggestions + * @param dictType the dictionary type, as a human-readable string + * @param isUpdatable whether to open the dictionary file in writable mode. 
+ */ + public BinaryDictionary(final String filename, final long offset, final long length, + final boolean useFullEditDistance, final Locale locale, final String dictType, + final boolean isUpdatable) { + super(dictType, locale); + mDictSize = length; + mDictFilePath = filename; + mIsUpdatable = isUpdatable; + mHasUpdated = false; + mUseFullEditDistance = useFullEditDistance; + loadDictionary(filename, offset, length, isUpdatable); + } + + /** + * Constructs binary dictionary on memory. + * @param filename the name of the file used to flush. + * @param useFullEditDistance whether to use the full edit distance in suggestions + * @param dictType the dictionary type, as a human-readable string + * @param formatVersion the format version of the dictionary + * @param attributeMap the attributes of the dictionary + */ + public BinaryDictionary(final String filename, final boolean useFullEditDistance, + final Locale locale, final String dictType, final long formatVersion, + final Map attributeMap) { + super(dictType, locale); + mDictSize = 0; + mDictFilePath = filename; + // On memory dictionary is always updatable. 
+ mIsUpdatable = true; + mHasUpdated = false; + mUseFullEditDistance = useFullEditDistance; + final String[] keyArray = new String[attributeMap.size()]; + final String[] valueArray = new String[attributeMap.size()]; + int index = 0; + for (final String key : attributeMap.keySet()) { + keyArray[index] = key; + valueArray[index] = attributeMap.get(key); + index++; + } + mNativeDict = createOnMemoryNative(formatVersion, locale.toString(), keyArray, valueArray); + } + + + static { + JniUtils.loadNativeLibrary(); + } + + private static native long openNative(String sourceDir, long dictOffset, long dictSize, + boolean isUpdatable); + private static native long createOnMemoryNative(long formatVersion, + String locale, String[] attributeKeyStringArray, String[] attributeValueStringArray); + private static native void getHeaderInfoNative(long dict, int[] outHeaderSize, + int[] outFormatVersion, ArrayList outAttributeKeys, + ArrayList outAttributeValues); + private static native boolean flushNative(long dict, String filePath); + private static native boolean needsToRunGCNative(long dict, boolean mindsBlockByGC); + private static native boolean flushWithGCNative(long dict, String filePath); + private static native void closeNative(long dict); + private static native int getFormatVersionNative(long dict); + private static native int getProbabilityNative(long dict, int[] word); + private static native int getMaxProbabilityOfExactMatchesNative(long dict, int[] word); + private static native int getNgramProbabilityNative(long dict, int[][] prevWordCodePointArrays, + boolean[] isBeginningOfSentenceArray, int[] word); + private static native void getWordPropertyNative(long dict, int[] word, + boolean isBeginningOfSentence, int[] outCodePoints, boolean[] outFlags, + int[] outProbabilityInfo, ArrayList outNgramPrevWordsArray, + ArrayList outNgramPrevWordIsBeginningOfSentenceArray, + ArrayList outNgramTargets, ArrayList outNgramProbabilityInfo, + ArrayList outShortcutTargets, 
ArrayList outShortcutProbabilities); + private static native int getNextWordNative(long dict, int token, int[] outCodePoints, + boolean[] outIsBeginningOfSentence); + private static native void getSuggestionsNative(long dict, long proximityInfo, + long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, + int[] pointerIds, int[] inputCodePoints, int inputSize, int[] suggestOptions, + int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray, + int prevWordCount, int[] outputSuggestionCount, int[] outputCodePoints, + int[] outputScores, int[] outputIndices, int[] outputTypes, + int[] outputAutoCommitFirstWordConfidence, + float[] inOutWeightOfLangModelVsSpatialModel); + private static native boolean addUnigramEntryNative(long dict, int[] word, int probability, + int[] shortcutTarget, int shortcutProbability, boolean isBeginningOfSentence, + boolean isNotAWord, boolean isPossiblyOffensive, int timestamp); + private static native boolean removeUnigramEntryNative(long dict, int[] word); + private static native boolean addNgramEntryNative(long dict, + int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray, + int[] word, int probability, int timestamp); + private static native boolean removeNgramEntryNative(long dict, + int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray, int[] word); + private static native boolean updateEntriesForWordWithNgramContextNative(long dict, + int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray, + int[] word, boolean isValidWord, int count, int timestamp); + private static native int updateEntriesForInputEventsNative(long dict, + WordInputEventForPersonalization[] inputEvents, int startIndex); + private static native String getPropertyNative(long dict, String query); + private static native boolean isCorruptedNative(long dict); + private static native boolean migrateNative(long dict, String dictFilePath, + long newFormatVersion); + + // TODO: Move native dict into 
session + private void loadDictionary(final String path, final long startOffset, + final long length, final boolean isUpdatable) { + mHasUpdated = false; + mNativeDict = openNative(path, startOffset, length, isUpdatable); + } + + // TODO: Check isCorrupted() for main dictionaries. + public boolean isCorrupted() { + if (!isValidDictionary()) { + return false; + } + if (!isCorruptedNative(mNativeDict)) { + return false; + } + // TODO: Record the corruption. + Log.e(TAG, "BinaryDictionary (" + mDictFilePath + ") is corrupted."); + Log.e(TAG, "locale: " + mLocale); + Log.e(TAG, "dict size: " + mDictSize); + Log.e(TAG, "updatable: " + mIsUpdatable); + return true; + } + + public DictionaryHeader getHeader() throws UnsupportedFormatException { + if (mNativeDict == 0) { + return null; + } + final int[] outHeaderSize = new int[1]; + final int[] outFormatVersion = new int[1]; + final ArrayList outAttributeKeys = new ArrayList<>(); + final ArrayList outAttributeValues = new ArrayList<>(); + getHeaderInfoNative(mNativeDict, outHeaderSize, outFormatVersion, outAttributeKeys, + outAttributeValues); + final HashMap attributes = new HashMap<>(); + for (int i = 0; i < outAttributeKeys.size(); i++) { + final String attributeKey = StringUtils.getStringFromNullTerminatedCodePointArray( + outAttributeKeys.get(i)); + final String attributeValue = StringUtils.getStringFromNullTerminatedCodePointArray( + outAttributeValues.get(i)); + attributes.put(attributeKey, attributeValue); + } + return new DictionaryHeader(new DictionaryOptions(attributes)); + } + + @Override + public ArrayList getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, + final SettingsValuesForSuggestion settingsValuesForSuggestion, + final int sessionId, final float weightForLocale, + final float[] inOutWeightOfLangModelVsSpatialModel) { + if (!isValidDictionary()) { + return null; + } + final DicTraverseSession session = getTraverseSession(sessionId); + 
Arrays.fill(session.mInputCodePoints, Constants.NOT_A_CODE); + ngramContext.outputToArray(session.mPrevWordCodePointArrays, + session.mIsBeginningOfSentenceArray); + final InputPointers inputPointers = composedData.mInputPointers; + final boolean isGesture = composedData.mIsBatchMode; + final int inputSize; + if (!isGesture) { + inputSize = + composedData.copyCodePointsExceptTrailingSingleQuotesAndReturnCodePointCount( + session.mInputCodePoints); + if (inputSize < 0) { + return null; + } + } else { + inputSize = inputPointers.getPointerSize(); + } + session.mNativeSuggestOptions.setUseFullEditDistance(mUseFullEditDistance); + session.mNativeSuggestOptions.setIsGesture(isGesture); + if (isGesture) + session.mNativeSuggestOptions.setIsSpaceAwareGesture(settingsValuesForSuggestion.mSpaceAwareGesture); + session.mNativeSuggestOptions.setBlockOffensiveWords(settingsValuesForSuggestion.mBlockPotentiallyOffensive); + session.mNativeSuggestOptions.setWeightForLocale(weightForLocale); + if (inOutWeightOfLangModelVsSpatialModel != null) { + session.mInputOutputWeightOfLangModelVsSpatialModel[0] = + inOutWeightOfLangModelVsSpatialModel[0]; + } else { + session.mInputOutputWeightOfLangModelVsSpatialModel[0] = + Dictionary.NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL; + } + // TOOD: Pass multiple previous words information for n-gram. 
+ getSuggestionsNative(mNativeDict, proximityInfoHandle, + getTraverseSession(sessionId).getSession(), inputPointers.getXCoordinates(), + inputPointers.getYCoordinates(), inputPointers.getTimes(), + inputPointers.getPointerIds(), session.mInputCodePoints, inputSize, + session.mNativeSuggestOptions.getOptions(), session.mPrevWordCodePointArrays, + session.mIsBeginningOfSentenceArray, ngramContext.getPrevWordCount(), + session.mOutputSuggestionCount, session.mOutputCodePoints, session.mOutputScores, + session.mSpaceIndices, session.mOutputTypes, + session.mOutputAutoCommitFirstWordConfidence, + session.mInputOutputWeightOfLangModelVsSpatialModel); + if (inOutWeightOfLangModelVsSpatialModel != null) { + inOutWeightOfLangModelVsSpatialModel[0] = + session.mInputOutputWeightOfLangModelVsSpatialModel[0]; + } + final int count = session.mOutputSuggestionCount[0]; + final ArrayList suggestions = new ArrayList<>(); + for (int j = 0; j < count; ++j) { + final int start = j * DICTIONARY_MAX_WORD_LENGTH; + int len = 0; + while (len < DICTIONARY_MAX_WORD_LENGTH + && session.mOutputCodePoints[start + len] != 0) { + ++len; + } + if (len > 0) { + suggestions.add(new SuggestedWordInfo( + new String(session.mOutputCodePoints, start, len), + "" /* prevWordsContext */, + (int)(session.mOutputScores[j] * weightForLocale), + session.mOutputTypes[j], + this /* sourceDict */, + session.mSpaceIndices[j] /* indexOfTouchPointOfSecondWord */, + session.mOutputAutoCommitFirstWordConfidence[0])); + } + } + return suggestions; + } + + public boolean isValidDictionary() { + return mNativeDict != 0; + } + + public int getFormatVersion() { + return getFormatVersionNative(mNativeDict); + } + + @Override + public boolean isInDictionary(final String word) { + return getFrequency(word) != NOT_A_PROBABILITY; + } + + @Override + public int getFrequency(final String word) { + if (TextUtils.isEmpty(word)) { + return NOT_A_PROBABILITY; + } + final int[] codePoints = StringUtils.toCodePointArray(word); + 
return getProbabilityNative(mNativeDict, codePoints); + } + + @Override + public int getMaxFrequencyOfExactMatches(final String word) { + if (TextUtils.isEmpty(word)) { + return NOT_A_PROBABILITY; + } + final int[] codePoints = StringUtils.toCodePointArray(word); + return getMaxProbabilityOfExactMatchesNative(mNativeDict, codePoints); + } + + public boolean isValidNgram(final NgramContext ngramContext, final String word) { + return getNgramProbability(ngramContext, word) != NOT_A_PROBABILITY; + } + + public int getNgramProbability(final NgramContext ngramContext, final String word) { + if (!ngramContext.isValid() || TextUtils.isEmpty(word)) { + return NOT_A_PROBABILITY; + } + final int[][] prevWordCodePointArrays = new int[ngramContext.getPrevWordCount()][]; + final boolean[] isBeginningOfSentenceArray = new boolean[ngramContext.getPrevWordCount()]; + ngramContext.outputToArray(prevWordCodePointArrays, isBeginningOfSentenceArray); + final int[] wordCodePoints = StringUtils.toCodePointArray(word); + return getNgramProbabilityNative(mNativeDict, prevWordCodePointArrays, + isBeginningOfSentenceArray, wordCodePoints); + } + + public WordProperty getWordProperty(final String word, final boolean isBeginningOfSentence) { + if (word == null) { + return null; + } + final int[] codePoints = StringUtils.toCodePointArray(word); + final int[] outCodePoints = new int[DICTIONARY_MAX_WORD_LENGTH]; + final boolean[] outFlags = new boolean[FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT]; + final int[] outProbabilityInfo = + new int[FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT]; + final ArrayList outNgramPrevWordsArray = new ArrayList<>(); + final ArrayList outNgramPrevWordIsBeginningOfSentenceArray = + new ArrayList<>(); + final ArrayList outNgramTargets = new ArrayList<>(); + final ArrayList outNgramProbabilityInfo = new ArrayList<>(); + final ArrayList outShortcutTargets = new ArrayList<>(); + final ArrayList outShortcutProbabilities = new ArrayList<>(); + 
getWordPropertyNative(mNativeDict, codePoints, isBeginningOfSentence, outCodePoints, + outFlags, outProbabilityInfo, outNgramPrevWordsArray, + outNgramPrevWordIsBeginningOfSentenceArray, outNgramTargets, + outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities); + return new WordProperty(codePoints, + outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX], + outFlags[FORMAT_WORD_PROPERTY_IS_POSSIBLY_OFFENSIVE_INDEX], + outFlags[FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX], + outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], + outFlags[FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX], outProbabilityInfo, + outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray, + outNgramTargets, outNgramProbabilityInfo, outShortcutTargets, + outShortcutProbabilities); + } + + public static class GetNextWordPropertyResult { + public WordProperty mWordProperty; + public int mNextToken; + + public GetNextWordPropertyResult(final WordProperty wordProperty, final int nextToken) { + mWordProperty = wordProperty; + mNextToken = nextToken; + } + } + + /** + * Method to iterate all words in the dictionary for makedict. + * If token is 0, this method newly starts iterating the dictionary. + */ + public GetNextWordPropertyResult getNextWordProperty(final int token) { + final int[] codePoints = new int[DICTIONARY_MAX_WORD_LENGTH]; + final boolean[] isBeginningOfSentence = new boolean[1]; + final int nextToken = getNextWordNative(mNativeDict, token, codePoints, + isBeginningOfSentence); + final String word = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); + return new GetNextWordPropertyResult( + getWordProperty(word, isBeginningOfSentence[0]), nextToken); + } + + // Add a unigram entry to binary dictionary with unigram attributes in native code. 
+    /**
+     * Hands a unigram, its attributes and an optional shortcut target to the native dictionary.
+     *
+     * @return true when the native call succeeded (the dictionary is then flagged as updated);
+     *         false when {@code word} is null, when it is empty and
+     *         {@code isBeginningOfSentence} is false, or when the native call failed.
+     */
+    public boolean addUnigramEntry(final String word, final int probability,
+            final String shortcutTarget, final int shortcutProbability,
+            final boolean isBeginningOfSentence, final boolean isNotAWord,
+            final boolean isPossiblyOffensive, final int timestamp) {
+        // An empty word is only accepted as the beginning-of-sentence marker.
+        final boolean hasUsableWord =
+                word != null && (!word.isEmpty() || isBeginningOfSentence);
+        if (!hasUsableWord) {
+            return false;
+        }
+        final int[] wordCodePoints = StringUtils.toCodePointArray(word);
+        final int[] shortcutCodePoints;
+        if (shortcutTarget == null) {
+            shortcutCodePoints = null;
+        } else {
+            shortcutCodePoints = StringUtils.toCodePointArray(shortcutTarget);
+        }
+        final boolean added = addUnigramEntryNative(mNativeDict, wordCodePoints, probability,
+                shortcutCodePoints, shortcutProbability, isBeginningOfSentence, isNotAWord,
+                isPossiblyOffensive, timestamp);
+        if (added) {
+            mHasUpdated = true;
+        }
+        return added;
+    }
+
+    /**
+     * Removes a unigram from the native dictionary.
+     *
+     * @return true when the native call succeeded (the dictionary is then flagged as updated);
+     *         false when {@code word} is null or empty, or when the native call failed.
+     */
+    public boolean removeUnigramEntry(final String word) {
+        if (TextUtils.isEmpty(word)) {
+            return false;
+        }
+        final boolean removed =
+                removeUnigramEntryNative(mNativeDict, StringUtils.toCodePointArray(word));
+        if (removed) {
+            mHasUpdated = true;
+        }
+        return removed;
+    }
+
+    // Add an n-gram entry to the binary dictionary with timestamp in native code.
+ public boolean addNgramEntry(final NgramContext ngramContext, final String word, + final int probability, final int timestamp) { + if (!ngramContext.isValid() || TextUtils.isEmpty(word)) { + return false; + } + final int[][] prevWordCodePointArrays = new int[ngramContext.getPrevWordCount()][]; + final boolean[] isBeginningOfSentenceArray = new boolean[ngramContext.getPrevWordCount()]; + ngramContext.outputToArray(prevWordCodePointArrays, isBeginningOfSentenceArray); + final int[] wordCodePoints = StringUtils.toCodePointArray(word); + if (!addNgramEntryNative(mNativeDict, prevWordCodePointArrays, + isBeginningOfSentenceArray, wordCodePoints, probability, timestamp)) { + return false; + } + mHasUpdated = true; + return true; + } + + // Update entries for the word occurrence with the ngramContext. + public boolean updateEntriesForWordWithNgramContext(@NonNull final NgramContext ngramContext, + final String word, final boolean isValidWord, final int count, final int timestamp) { + if (TextUtils.isEmpty(word)) { + return false; + } + final int[][] prevWordCodePointArrays = new int[ngramContext.getPrevWordCount()][]; + final boolean[] isBeginningOfSentenceArray = new boolean[ngramContext.getPrevWordCount()]; + ngramContext.outputToArray(prevWordCodePointArrays, isBeginningOfSentenceArray); + final int[] wordCodePoints = StringUtils.toCodePointArray(word); + if (!updateEntriesForWordWithNgramContextNative(mNativeDict, prevWordCodePointArrays, + isBeginningOfSentenceArray, wordCodePoints, isValidWord, count, timestamp)) { + return false; + } + mHasUpdated = true; + return true; + } + + public void updateEntriesForInputEvents(final WordInputEventForPersonalization[] inputEvents) { + if (!isValidDictionary()) { + return; + } + int processedEventCount = 0; + while (processedEventCount < inputEvents.length) { + if (needsToRunGC(true /* mindsBlockByGC */)) { + flushWithGC(); + } + processedEventCount = updateEntriesForInputEventsNative(mNativeDict, inputEvents, + 
processedEventCount); + mHasUpdated = true; + if (processedEventCount <= 0) { + return; + } + } + } + + private void reopen() { + close(); + final File dictFile = new File(mDictFilePath); + // WARNING: Because we pass 0 as the offset and file.length() as the length, this can + // only be called for actual files. Right now it's only called by the flush() family of + // functions, which require an updatable dictionary, so it's okay. But beware. + loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */, + dictFile.length(), mIsUpdatable); + } + + // Flush to dict file if the dictionary has been updated. + public boolean flush() { + if (!isValidDictionary()) { + return false; + } + if (mHasUpdated) { + if (!flushNative(mNativeDict, mDictFilePath)) { + return false; + } + reopen(); + } + return true; + } + + // Run GC and flush to dict file if the dictionary has been updated. + public boolean flushWithGCIfHasUpdated() { + if (mHasUpdated) { + return flushWithGC(); + } + return true; + } + + // Run GC and flush to dict file. + public boolean flushWithGC() { + if (!isValidDictionary()) { + return false; + } + if (!flushWithGCNative(mNativeDict, mDictFilePath)) { + return false; + } + reopen(); + return true; + } + + /** + * Checks whether GC is needed to run or not. + * @param mindsBlockByGC Whether to mind operations blocked by GC. We don't need to care about + * the blocking in some situations such as in idle time or just before closing. + * @return whether GC is needed to run or not. 
+ */ + public boolean needsToRunGC(final boolean mindsBlockByGC) { + if (!isValidDictionary()) { + return false; + } + return needsToRunGCNative(mNativeDict, mindsBlockByGC); + } + + public boolean migrateTo(final int newFormatVersion) { + if (!isValidDictionary()) { + return false; + } + final File isMigratingDir = + new File(mDictFilePath + DIR_NAME_SUFFIX_FOR_RECORD_MIGRATION); + if (isMigratingDir.exists()) { + isMigratingDir.delete(); + Log.e(TAG, "Previous migration attempt failed probably due to a crash. " + + "Giving up using the old dictionary (" + mDictFilePath + ")."); + return false; + } + if (!isMigratingDir.mkdir()) { + Log.e(TAG, "Cannot create a dir (" + isMigratingDir.getAbsolutePath() + + ") to record migration."); + return false; + } + try { + final String tmpDictFilePath = mDictFilePath + DICT_FILE_NAME_SUFFIX_FOR_MIGRATION; + if (!migrateNative(mNativeDict, tmpDictFilePath, newFormatVersion)) { + return false; + } + close(); + final File dictFile = new File(mDictFilePath); + final File tmpDictFile = new File(tmpDictFilePath); + if (!FileUtils.deleteRecursively(dictFile)) { + return false; + } + if (!BinaryDictionaryUtils.renameDict(tmpDictFile, dictFile)) { + return false; + } + loadDictionary(dictFile.getAbsolutePath(), 0 /* startOffset */, + dictFile.length(), mIsUpdatable); + return true; + } finally { + isMigratingDir.delete(); + } + } + + public String getPropertyForGettingStats(final String query) { + if (!isValidDictionary()) { + return ""; + } + return getPropertyNative(mNativeDict, query); + } + + @Override + public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { + return candidate.mAutoCommitFirstWordConfidence > CONFIDENCE_TO_AUTO_COMMIT; + } + + @Override + public void close() { + synchronized (mDicTraverseSessions) { + final int sessionsSize = mDicTraverseSessions.size(); + for (int index = 0; index < sessionsSize; ++index) { + final DicTraverseSession traverseSession = mDicTraverseSessions.valueAt(index); + if 
(traverseSession != null) { + traverseSession.close(); + } + } + mDicTraverseSessions.clear(); + } + closeInternalLocked(); + } + + private synchronized void closeInternalLocked() { + if (mNativeDict != 0) { + closeNative(mNativeDict); + mNativeDict = 0; + } + } + + private String mDictFileHash; + public String getHash() { + if (mDictFileHash != null) return mDictFileHash; + final File dict = new File(mDictFilePath); + if (!dict.isFile()) { + mDictFileHash = ""; + return mDictFileHash; + } + mDictFileHash = ChecksumCalculator.INSTANCE.checksum(dict); + if (mDictFileHash == null) + mDictFileHash = ""; + return mDictFileHash; + } + + // TODO: Manage BinaryDictionary instances without using WeakReference or something. + @Override + protected void finalize() throws Throwable { + try { + closeInternalLocked(); + } finally { + super.finalize(); + } + } +} diff --git a/app/src/main/java/be/scri/inputmethod/latin/DicTraverseSession.java b/app/src/main/java/be/scri/inputmethod/latin/DicTraverseSession.java new file mode 100644 index 000000000..a712e882a --- /dev/null +++ b/app/src/main/java/be/scri/inputmethod/latin/DicTraverseSession.java @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.inputmethod.latin; + +import be.scri.latin.common.NativeSuggestOptions; +import be.scri.latin.define.DecoderSpecificConstants; +import be.scri.latin.utils.JniUtils; + +import java.util.Locale; + +public final class DicTraverseSession { + static { + JniUtils.loadNativeLibrary(); + } + // Must be equal to MAX_RESULTS in native/jni/src/defines.h + private static final int MAX_RESULTS = 18; + public final int[] mInputCodePoints = + new int[DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH]; + public final int[][] mPrevWordCodePointArrays = + new int[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][]; + public final boolean[] mIsBeginningOfSentenceArray = + new 
boolean[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + public final int[] mOutputSuggestionCount = new int[1]; + public final int[] mOutputCodePoints = + new int[DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH * MAX_RESULTS]; + public final int[] mSpaceIndices = new int[MAX_RESULTS]; + public final int[] mOutputScores = new int[MAX_RESULTS]; + public final int[] mOutputTypes = new int[MAX_RESULTS]; + // Only one result is ever used + public final int[] mOutputAutoCommitFirstWordConfidence = new int[1]; + public final float[] mInputOutputWeightOfLangModelVsSpatialModel = new float[1]; + + public final NativeSuggestOptions mNativeSuggestOptions = new NativeSuggestOptions(); + + private static native long setDicTraverseSessionNative(String locale, long dictSize); + private static native void initDicTraverseSessionNative(long nativeDicTraverseSession, + long dictionary, int[] previousWord, int previousWordLength); + private static native void releaseDicTraverseSessionNative(long nativeDicTraverseSession); + + private long mNativeDicTraverseSession; + + public DicTraverseSession(Locale locale, long dictionary, long dictSize) { + mNativeDicTraverseSession = createNativeDicTraverseSession( + locale != null ? 
locale.toString() : "", dictSize); + initSession(dictionary); + } + + public long getSession() { + return mNativeDicTraverseSession; + } + + public void initSession(long dictionary) { + initSession(dictionary, null, 0); + } + + public void initSession(long dictionary, int[] previousWord, int previousWordLength) { + initDicTraverseSessionNative( + mNativeDicTraverseSession, dictionary, previousWord, previousWordLength); + } + + private static long createNativeDicTraverseSession(String locale, long dictSize) { + return setDicTraverseSessionNative(locale, dictSize); + } + + private void closeInternal() { + if (mNativeDicTraverseSession != 0) { + releaseDicTraverseSessionNative(mNativeDicTraverseSession); + mNativeDicTraverseSession = 0; + } + } + + public void close() { + closeInternal(); + } + + @Override + protected void finalize() throws Throwable { + try { + closeInternal(); + } finally { + super.finalize(); + } + } +} diff --git a/app/src/main/java/be/scri/inputmethod/latin/utils/BinaryDictionaryUtils.java b/app/src/main/java/be/scri/inputmethod/latin/utils/BinaryDictionaryUtils.java new file mode 100644 index 000000000..237c0c67f --- /dev/null +++ b/app/src/main/java/be/scri/inputmethod/latin/utils/BinaryDictionaryUtils.java @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.inputmethod.latin.utils; + +import be.scri.inputmethod.latin.BinaryDictionary; +import be.scri.latin.common.StringUtils; +import be.scri.latin.makedict.DictionaryHeader; +import be.scri.latin.makedict.UnsupportedFormatException; +import be.scri.latin.utils.JniUtils; + +import java.io.File; +import java.io.IOException; +import java.util.Locale; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public final class BinaryDictionaryUtils { + private static final String TAG = BinaryDictionaryUtils.class.getSimpleName(); + + private 
BinaryDictionaryUtils() {
+        // This utility class is not publicly instantiable.
+    }
+
+    static {
+        JniUtils.loadNativeLibrary();
+    }
+
+    private static native boolean createEmptyDictFileNative(String filePath, long dictVersion,
+            String locale, String[] attributeKeyStringArray, String[] attributeValueStringArray);
+    private static native float calcNormalizedScoreNative(int[] before, int[] after, int score);
+    private static native int setCurrentTimeForTestNative(int currentTime);
+
+    public static DictionaryHeader getHeader(final File dictFile)
+            throws IOException, UnsupportedFormatException {
+        return getHeaderWithOffsetAndLength(dictFile, 0 /* offset */, dictFile.length());
+    }
+
+    public static DictionaryHeader getHeaderWithOffsetAndLength(final File dictFile,
+            final long offset, final long length) throws IOException, UnsupportedFormatException {
+        // dictType is never used for reading the header. Passing an empty string.
+        final BinaryDictionary binaryDictionary = new BinaryDictionary(
+                dictFile.getAbsolutePath(), offset, length,
+                true /* useFullEditDistance */, null /* locale */, "" /* dictType */,
+                false /* isUpdatable */);
+        final DictionaryHeader header = binaryDictionary.getHeader();
+        binaryDictionary.close();
+        if (header == null) {
+            throw new IOException();
+        }
+        return header;
+    }
+
+    /**
+     * Renames a dictionary that is either a single file or a directory of files.
+     *
+     * For a directory, every regular file inside it has the first occurrence of the old
+     * directory name in its file name replaced by the new directory name, then the directory
+     * itself is renamed. Sub-directories are left untouched.
+     *
+     * @param dictFile the existing dictionary file or directory
+     * @param newDictFile the destination; for a directory rename it must not exist yet
+     * @return true if every rename succeeded; false if {@code dictFile} is neither a file nor
+     *         a directory, the destination directory already exists, the directory content
+     *         could not be listed, or any rename failed
+     */
+    public static boolean renameDict(final File dictFile, final File newDictFile) {
+        if (dictFile.isFile()) {
+            return dictFile.renameTo(newDictFile);
+        } else if (dictFile.isDirectory()) {
+            final String dictName = dictFile.getName();
+            final String newDictName = newDictFile.getName();
+            if (newDictFile.exists()) {
+                return false;
+            }
+            // listFiles() returns null when the directory cannot be read (I/O error); report
+            // that as a failed rename instead of throwing a NullPointerException.
+            final File[] files = dictFile.listFiles();
+            if (files == null) {
+                return false;
+            }
+            for (final File file : files) {
+                if (!file.isFile()) {
+                    continue;
+                }
+                final String fileName = file.getName();
+                final String newFileName = fileName.replaceFirst(
+                        Pattern.quote(dictName), Matcher.quoteReplacement(newDictName));
+                if (!file.renameTo(new File(dictFile, newFileName))) {
+                    return false;
+                }
+            }
+            return dictFile.renameTo(newDictFile);
+        }
+        return false;
+    }
+
+    public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
+            final Locale locale, final Map attributeMap) {
+        final String[] keyArray = new String[attributeMap.size()];
+        final String[] valueArray = new String[attributeMap.size()];
+        int index = 0;
+        for (final String key : attributeMap.keySet()) {
+            keyArray[index] = key;
+            valueArray[index] = attributeMap.get(key);
+            index++;
+        }
+        return createEmptyDictFileNative(filePath, dictVersion, locale.toString(), keyArray,
+                valueArray);
+    }
+
+    /** normalized score is >= 0, with 0 being a bad match, ~0.1 ok for autocorrect, and ~1.5 a very good match */
+    public static float calcNormalizedScore(final String before, final String after,
+            final int score) {
+        return calcNormalizedScoreNative(StringUtils.toCodePointArray(before),
+                StringUtils.toCodePointArray(after), score);
+    }
+
+    /**
+     * Control the current time to be used in the native code. If currentTime >= 0, this method sets
+     * the current time and gets into test mode.
+     * In test mode, set timestamp is used as the current time in the native code.
+     * If currentTime < 0, quit the test mode and returns to using time() to get the current time.
+     *
+     * @param currentTime seconds since the unix epoch
+     * @return current time got in the native code.
+ */ + public static int setCurrentTimeForTest(final int currentTime) { + return setCurrentTimeForTestNative(currentTime); + } +} diff --git a/app/src/main/java/be/scri/inputmethod/latin/utils/WordInputEventForPersonalization.java b/app/src/main/java/be/scri/inputmethod/latin/utils/WordInputEventForPersonalization.java new file mode 100644 index 000000000..e1a1e0ff2 --- /dev/null +++ b/app/src/main/java/be/scri/inputmethod/latin/utils/WordInputEventForPersonalization.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.inputmethod.latin.utils; + +import be.scri.latin.NgramContext; +import be.scri.latin.common.StringUtils; +import be.scri.latin.define.DecoderSpecificConstants; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +// Note: this class is used as a parameter type of a native method. You should be careful when you +// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative(). +public final class WordInputEventForPersonalization { + public final int[] mTargetWord; + public final int mPrevWordsCount; + public final int[][] mPrevWordArray = + new int[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][]; + public final boolean[] mIsPrevWordBeginningOfSentenceArray = + new boolean[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + // Time stamp in seconds. 
+ public final int mTimestamp; + + public WordInputEventForPersonalization(final CharSequence targetWord, + final NgramContext ngramContext, final int timestamp) { + mTargetWord = StringUtils.toCodePointArray(targetWord); + mPrevWordsCount = ngramContext.getPrevWordCount(); + ngramContext.outputToArray(mPrevWordArray, mIsPrevWordBeginningOfSentenceArray); + mTimestamp = timestamp; + } + + public static ArrayList createInputEventFrom( + final List tokens, final int timestamp, + final Object spacingAndPunctuations, final Locale locale) { + return new ArrayList(); + } +} diff --git a/app/src/main/java/be/scri/latin/NgramContext.java b/app/src/main/java/be/scri/latin/NgramContext.java new file mode 100644 index 000000000..c260fdb6d --- /dev/null +++ b/app/src/main/java/be/scri/latin/NgramContext.java @@ -0,0 +1,289 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin; + +import android.text.TextUtils; + +import androidx.annotation.NonNull; + +import be.scri.latin.common.StringUtils; +import be.scri.latin.define.DecoderSpecificConstants; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Class to represent information of previous words. This class is used to add n-gram entries + * into binary dictionaries, to get predictions, and to get suggestions. 
+ */ +public class NgramContext { + @NonNull + public static final NgramContext EMPTY_PREV_WORDS_INFO = + new NgramContext(WordInfo.EMPTY_WORD_INFO); + @NonNull + public static final NgramContext BEGINNING_OF_SENTENCE = + new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO); + + public static final String BEGINNING_OF_SENTENCE_TAG = ""; + + public static final String CONTEXT_SEPARATOR = " "; + + public static NgramContext getEmptyPrevWordsContext(int maxPrevWordCount) { + return new NgramContext(maxPrevWordCount, WordInfo.EMPTY_WORD_INFO); + } + + /** + * Word information used to represent previous words information. + */ + public static class WordInfo { + @NonNull + public static final WordInfo EMPTY_WORD_INFO = new WordInfo(null); + @NonNull + public static final WordInfo BEGINNING_OF_SENTENCE_WORD_INFO = new WordInfo(); + + // This is an empty char sequence when mIsBeginningOfSentence is true. + public final CharSequence mWord; + // TODO: Have sentence separator. + // Whether the current context is beginning of sentence or not. This is true when composing + // at the beginning of an input field or composing a word after a sentence separator. + public final boolean mIsBeginningOfSentence; + + // Beginning of sentence. 
+        private WordInfo() {
+            mWord = "";
+            mIsBeginningOfSentence = true;
+        }
+
+        public WordInfo(final CharSequence word) {
+            mWord = word;
+            mIsBeginningOfSentence = false;
+        }
+
+        public boolean isValid() {
+            return mWord != null;
+        }
+
+        /**
+         * Hash of the word's character content and the beginning-of-sentence flag.
+         *
+         * equals() compares mWord with TextUtils.equals, i.e. by characters and regardless of
+         * the concrete CharSequence class, so the hash must be content based as well. When
+         * mWord is already a String, toString() returns it unchanged and the hash value is the
+         * same as hashing mWord directly.
+         */
+        @Override
+        public int hashCode() {
+            final String wordContent = (mWord == null) ? null : mWord.toString();
+            return Arrays.hashCode(new Object[] { wordContent, mIsBeginningOfSentence } );
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (!(o instanceof WordInfo wordInfo)) return false;
+            if (mWord == null || wordInfo.mWord == null) {
+                return mWord == wordInfo.mWord
+                        && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence;
+            }
+            return TextUtils.equals(mWord, wordInfo.mWord)
+                    && mIsBeginningOfSentence == wordInfo.mIsBeginningOfSentence;
+        }
+    }
+
+    // The words immediately before the considered word. EMPTY_WORD_INFO element means we don't
+    // have any context for that previous word including the "beginning of sentence context" - we
+    // just don't know what to predict using the information. An example of that is after a comma.
+    // For simplicity of implementation, elements may also be EMPTY_WORD_INFO transiently after the
+    // WordComposer was reset and before starting a new composing word, but we should never be
+    // calling getSuggestions* in this situation.
+    private final WordInfo[] mPrevWordsInfo;
+    private final int mPrevWordsCount;
+
+    private final int mMaxPrevWordCount;
+
+    // Construct from the previous word information.
+    public NgramContext(final WordInfo... prevWordsInfo) {
+        this(DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM, prevWordsInfo);
+    }
+
+    public NgramContext(final int maxPrevWordCount, final WordInfo...
prevWordsInfo) { + mPrevWordsInfo = prevWordsInfo; + mPrevWordsCount = prevWordsInfo.length; + mMaxPrevWordCount = maxPrevWordCount; + } + + public boolean changeWordIfAfterBeginningOfSentence(final String from, final String to) { + boolean beginning = false; + for (int i = mPrevWordsCount - 1; i >= 0; i--) { + WordInfo info = mPrevWordsInfo[i]; + if (beginning && TextUtils.equals(info.mWord, from)) { + mPrevWordsInfo[i] = new WordInfo(to); + return true; + } + beginning = info.mIsBeginningOfSentence; + } + return false; + } + + /** + * Create next prevWordsInfo using current prevWordsInfo. + */ + @NonNull + public NgramContext getNextNgramContext(final WordInfo wordInfo) { + final int nextPrevWordCount = Math.min(mMaxPrevWordCount, mPrevWordsCount + 1); + final WordInfo[] prevWordsInfo = new WordInfo[nextPrevWordCount]; + prevWordsInfo[0] = wordInfo; + System.arraycopy(mPrevWordsInfo, 0, prevWordsInfo, 1, nextPrevWordCount - 1); + return new NgramContext(mMaxPrevWordCount, prevWordsInfo); + } + + + /** + * Extracts the previous words context. + * + * @return a String with the previous words separated by white space. + */ + public String extractPrevWordsContext() { + final ArrayList terms = new ArrayList<>(); + for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) { + if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) { + final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i]; + if (wordInfo.mIsBeginningOfSentence) { + terms.add(BEGINNING_OF_SENTENCE_TAG); + } else { + final String term = wordInfo.mWord.toString(); + if (!term.isEmpty()) { + terms.add(term); + } + } + } + } + return TextUtils.join(CONTEXT_SEPARATOR, terms); + } + + /** + * Extracts the previous words context. + * + * @return a String array with the previous words. 
+     */
+    public String[] extractPrevWordsContextArray() {
+        final ArrayList prevTermList = new ArrayList<>();
+        for (int i = mPrevWordsInfo.length - 1; i >= 0; --i) {
+            if (mPrevWordsInfo[i] != null && mPrevWordsInfo[i].isValid()) {
+                final NgramContext.WordInfo wordInfo = mPrevWordsInfo[i];
+                if (wordInfo.mIsBeginningOfSentence) {
+                    prevTermList.add(BEGINNING_OF_SENTENCE_TAG);
+                } else {
+                    final String term = wordInfo.mWord.toString();
+                    if (!term.isEmpty()) {
+                        prevTermList.add(term);
+                    }
+                }
+            }
+        }
+        return prevTermList.toArray(new String[0]);
+    }
+
+    public boolean isValid() {
+        return mPrevWordsCount > 0 && mPrevWordsInfo[0].isValid();
+    }
+
+    public boolean isBeginningOfSentenceContext() {
+        return mPrevWordsCount > 0 && mPrevWordsInfo[0].mIsBeginningOfSentence;
+    }
+
+    // n is 1-indexed.
+    // TODO: Remove
+    public CharSequence getNthPrevWord(final int n) {
+        if (n <= 0 || n > mPrevWordsCount) {
+            return null;
+        }
+        return mPrevWordsInfo[n - 1].mWord;
+    }
+
+    // n is 1-indexed.
+    public boolean isNthPrevWordBeginningOfSentence(final int n) {
+        if (n <= 0 || n > mPrevWordsCount) {
+            return false;
+        }
+        return mPrevWordsInfo[n - 1].mIsBeginningOfSentence;
+    }
+
+    public void outputToArray(final int[][] codePointArrays,
+            final boolean[] isBeginningOfSentenceArray) {
+        for (int i = 0; i < mPrevWordsCount; i++) {
+            final WordInfo wordInfo = mPrevWordsInfo[i];
+            if (wordInfo == null || !wordInfo.isValid()) {
+                codePointArrays[i] = new int[0];
+                isBeginningOfSentenceArray[i] = false;
+                continue;
+            }
+            codePointArrays[i] = StringUtils.toCodePointArray(wordInfo.mWord);
+            isBeginningOfSentenceArray[i] = wordInfo.mIsBeginningOfSentence;
+        }
+    }
+
+    public int getPrevWordCount() {
+        return mPrevWordsCount;
+    }
+
+    /**
+     * Hash of the leading run of real (non-null, non-empty) previous words.
+     *
+     * Hashing stops at the first null or EMPTY_WORD_INFO element. The previous guard was
+     * inverted: it stopped at the first real word, so every valid context hashed to 0.
+     * Two contexts that equals() treats as equal share the same leading run (equals() only
+     * tolerates null / EMPTY_WORD_INFO elements beyond the shorter one), so they still
+     * produce the same hash.
+     */
+    @Override
+    public int hashCode() {
+        int hashValue = 0;
+        for (final WordInfo wordInfo : mPrevWordsInfo) {
+            if (wordInfo == null || WordInfo.EMPTY_WORD_INFO.equals(wordInfo)) {
+                break;
+            }
+            // Order-sensitive mix, so swapped or repeated words do not cancel out.
+            hashValue = 31 * hashValue + wordInfo.hashCode();
+        }
+        return hashValue;
+    }
+
+    @Override
+    public
boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof NgramContext prevWordsInfo)) return false; + + final int minLength = Math.min(mPrevWordsCount, prevWordsInfo.mPrevWordsCount); + for (int i = 0; i < minLength; i++) { + if (!mPrevWordsInfo[i].equals(prevWordsInfo.mPrevWordsInfo[i])) { + return false; + } + } + final WordInfo[] longerWordsInfo; + final int longerWordsInfoCount; + if (mPrevWordsCount > prevWordsInfo.mPrevWordsCount) { + longerWordsInfo = mPrevWordsInfo; + longerWordsInfoCount = mPrevWordsCount; + } else { + longerWordsInfo = prevWordsInfo.mPrevWordsInfo; + longerWordsInfoCount = prevWordsInfo.mPrevWordsCount; + } + for (int i = minLength; i < longerWordsInfoCount; i++) { + if (longerWordsInfo[i] != null + && !WordInfo.EMPTY_WORD_INFO.equals(longerWordsInfo[i])) { + return false; + } + } + return true; + } + + @Override + public String toString() { + final StringBuilder builder = new StringBuilder(); + for (int i = 0; i < mPrevWordsCount; i++) { + final WordInfo wordInfo = mPrevWordsInfo[i]; + builder.append("PrevWord["); + builder.append(i); + builder.append("]: "); + if (wordInfo == null) { + builder.append("null. "); + continue; + } + if (!wordInfo.isValid()) { + builder.append("Empty. "); + continue; + } + builder.append(wordInfo.mWord); + builder.append(", isBeginningOfSentence: "); + builder.append(wordInfo.mIsBeginningOfSentence); + builder.append(". 
"); + } + return builder.toString(); + } +} diff --git a/app/src/main/java/be/scri/latin/SuggestedWords.java b/app/src/main/java/be/scri/latin/SuggestedWords.java new file mode 100644 index 000000000..597f8e957 --- /dev/null +++ b/app/src/main/java/be/scri/latin/SuggestedWords.java @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin; + +import android.text.TextUtils; +import android.view.inputmethod.CompletionInfo; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; + +import be.scri.latin.common.StringUtils; +import be.scri.latin.common.StringUtilsKt; +import be.scri.latin.define.DebugFlags; +import be.scri.latin.dictionary.Dictionary; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; + +public class SuggestedWords { + public static final int INDEX_OF_TYPED_WORD = 0; + public static final int INDEX_OF_AUTO_CORRECTION = 1; + public static final int NOT_A_SEQUENCE_NUMBER = -1; + + public static final int INPUT_STYLE_NONE = 0; + public static final int INPUT_STYLE_TYPING = 1; + public static final int INPUT_STYLE_UPDATE_BATCH = 2; + public static final int INPUT_STYLE_TAIL_BATCH = 3; + public static final int INPUT_STYLE_APPLICATION_SPECIFIED = 4; + public static final int INPUT_STYLE_RECORRECTION = 5; + public static final int INPUT_STYLE_PREDICTION = 6; + public static final int INPUT_STYLE_BEGINNING_OF_SENTENCE_PREDICTION = 7; + + // The maximum number of suggestions available. 
+ public static final int MAX_SUGGESTIONS = 18; + + private static final ArrayList EMPTY_WORD_INFO_LIST = new ArrayList<>(0); + @NonNull + private static final SuggestedWords EMPTY = new SuggestedWords( + EMPTY_WORD_INFO_LIST, null, null, false, + false, false, INPUT_STYLE_NONE, NOT_A_SEQUENCE_NUMBER); + + @NonNull + private static final SuggestedWords EMPTY_BATCH = new SuggestedWords( + EMPTY_WORD_INFO_LIST, null, null, false, + false, false, INPUT_STYLE_UPDATE_BATCH, NOT_A_SEQUENCE_NUMBER); + + @Nullable + public final SuggestedWordInfo mTypedWordInfo; + public final boolean mTypedWordValid; + // Note: this INCLUDES cases where the word will auto-correct to itself. A good definition + // of what this flag means would be "the top suggestion is strong enough to auto-correct", + // whether this exactly matches the user entry or not. + public final boolean mWillAutoCorrect; + public final boolean mIsObsoleteSuggestions; + // How the input for these suggested words was done by the user. Must be one of the + // INPUT_STYLE_* constants above. + public final int mInputStyle; + public final int mSequenceNumber; // Sequence number for auto-commit. 
+ @NonNull + protected final ArrayList mSuggestedWordInfoList; + @Nullable + public final ArrayList mRawSuggestions; + + public SuggestedWords(@NonNull final ArrayList suggestedWordInfoList, + @Nullable final ArrayList rawSuggestions, + @Nullable final SuggestedWordInfo typedWordInfo, + final boolean typedWordValid, + final boolean willAutoCorrect, + final boolean isObsoleteSuggestions, + final int inputStyle, + final int sequenceNumber) { + mSuggestedWordInfoList = suggestedWordInfoList; + mRawSuggestions = rawSuggestions; + mTypedWordValid = typedWordValid; + mWillAutoCorrect = willAutoCorrect; + mIsObsoleteSuggestions = isObsoleteSuggestions; + mInputStyle = inputStyle; + mSequenceNumber = sequenceNumber; + mTypedWordInfo = typedWordInfo; + } + + public boolean isEmpty() { + return mSuggestedWordInfoList.isEmpty(); + } + + public int size() { + return mSuggestedWordInfoList.size(); + } + + /** + * Get suggested word to show as suggestions to UI. + * + * @return the count of suggested word to show as suggestions to UI. + */ + public int getWordCountToShow() { + if (isPrediction()) { + return size(); + } + return size() - /* typed word */ 1; + } + + /** + * Get {@link SuggestedWordInfo} object for the typed word. + * @return The {@link SuggestedWordInfo} object for the typed word. + */ + public SuggestedWordInfo getTypedWordInfo() { + return mTypedWordInfo; + } + + /** + * Get suggested word at index. + * @param index The index of the suggested word. + * @return The suggested word. + */ + public String getWord(final int index) { + return mSuggestedWordInfoList.get(index).mWord; + } + + /** + * Get displayed text at index. + * In RTL languages, the displayed text on the suggestion strip may be different from the + * suggested word that is returned from {@link #getWord(int)}. For example the displayed text + * of punctuation suggestion "(" should be ")". + * @param index The index of the text to display. + * @return The text to be displayed. 
+     */
+    public String getLabel(final int index) {
+        final SuggestedWordInfo labelInfo = mSuggestedWordInfoList.get(index);
+        return labelInfo.mWord;
+    }
+
+    /**
+     * Looks up the {@link SuggestedWordInfo} stored at a position of the suggestion list.
+     * @param index The position to read.
+     * @return The {@link SuggestedWordInfo} at that position.
+     */
+    public SuggestedWordInfo getInfo(final int index) {
+        return mSuggestedWordInfoList.get(index);
+    }
+
+    /**
+     * Finds where a suggestion sits in the suggestion list.
+     * @param suggestedWordInfo The {@link SuggestedWordInfo} to look for.
+     * @return The result of {@code indexOf} on the suggestion list.
+     */
+    public int indexOf(SuggestedWordInfo suggestedWordInfo) {
+        return mSuggestedWordInfoList.indexOf(suggestedWordInfo);
+    }
+
+    /**
+     * Debug text of the suggestion at a position.
+     * @param pos The position of the suggestion in the list.
+     * @return The suggestion's debug string, or null when debugging is disabled, the entry is
+     *         null, or its debug string is null or empty.
+     */
+    public String getDebugString(final int pos) {
+        if (DebugFlags.DEBUG_ENABLED) {
+            final SuggestedWordInfo info = getInfo(pos);
+            if (info != null) {
+                final String text = info.getDebugString();
+                return TextUtils.isEmpty(text) ? null : text;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * The predicate to tell whether this object represents punctuation suggestions.
+     * @return false if this object doesn't represent punctuation suggestions.
+ */ + public boolean isPunctuationSuggestions() { + return false; + } + + @Override + public String toString() { + // Pretty-print method to help debug + return "SuggestedWords:" + + " mTypedWordValid=" + mTypedWordValid + + " mWillAutoCorrect=" + mWillAutoCorrect + + " mInputStyle=" + mInputStyle + + " words=" + Arrays.toString(mSuggestedWordInfoList.toArray()); + } + + /** + * Wrap application-specified completions into suggestions. + * Null entries and entries whose text is null are skipped. + * @param infos The completions supplied by the application. + * @return A new list holding one {@link SuggestedWordInfo} per usable completion. + */ + public static ArrayList<SuggestedWordInfo> getFromApplicationSpecifiedCompletions( + final CompletionInfo[] infos) { + final ArrayList<SuggestedWordInfo> result = new ArrayList<>(); + for (final CompletionInfo info : infos) { + if (null == info || null == info.getText()) { + continue; + } + result.add(new SuggestedWordInfo(info)); + } + return result; + } + + @NonNull + public static SuggestedWords getEmptyInstance() { + return SuggestedWords.EMPTY; + } + + @NonNull + public static SuggestedWords getEmptyBatchInstance() { + return SuggestedWords.EMPTY_BATCH; + } + + // Should get rid of the first one (what the user typed previously) from suggestions + // and replace it with what the user currently typed. + public static ArrayList<SuggestedWordInfo> getTypedWordAndPreviousSuggestions( + @NonNull final SuggestedWordInfo typedWordInfo, + @NonNull final SuggestedWords previousSuggestions) { + final ArrayList<SuggestedWordInfo> suggestionsList = new ArrayList<>(); + final HashSet<String> alreadySeen = new HashSet<>(); + suggestionsList.add(typedWordInfo); + alreadySeen.add(typedWordInfo.mWord); + final int previousSize = previousSuggestions.size(); + for (int index = 1; index < previousSize; index++) { + final SuggestedWordInfo prevWordInfo = previousSuggestions.getInfo(index); + final String prevWord = prevWordInfo.mWord; + // Filter out duplicate suggestions. 
+ if (!alreadySeen.contains(prevWord)) { + suggestionsList.add(prevWordInfo); + alreadySeen.add(prevWord); + } + } + return suggestionsList; + } + + public SuggestedWordInfo getAutoCommitCandidate() { + if (mSuggestedWordInfoList.size() <= 0) return null; + final SuggestedWordInfo candidate = mSuggestedWordInfoList.get(0); + return candidate.isEligibleForAutoCommit() ? candidate : null; + } + + // non-final for testability. + public static class SuggestedWordInfo { + public static final int NOT_AN_INDEX = -1; + public static final int NOT_A_CONFIDENCE = -1; + public static final int MAX_SCORE = Integer.MAX_VALUE; + + private static final int KIND_MASK_KIND = 0xFF; // Mask to get only the kind + public static final int KIND_TYPED = 0; // What user typed + public static final int KIND_CORRECTION = 1; // Simple correction/suggestion + public static final int KIND_COMPLETION = 2; // Completion (suggestion with appended chars) + public static final int KIND_WHITELIST = 3; // Whitelisted word + public static final int KIND_BLACKLIST = 4; // Blacklisted word + public static final int KIND_HARDCODED = 5; // Hardcoded suggestion, e.g. 
punctuation + public static final int KIND_APP_DEFINED = 6; // Suggested by the application + public static final int KIND_SHORTCUT = 7; // A shortcut + public static final int KIND_PREDICTION = 8; // A prediction (== a suggestion with no input) + // KIND_RESUMED: A resumed suggestion (comes from a span, currently this type is used only + // in java for re-correction) + public static final int KIND_RESUMED = 9; + public static final int KIND_OOV_CORRECTION = 10; // Most probable string correction + + public static final int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; + public static final int KIND_FLAG_EXACT_MATCH = 0x40000000; + public static final int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000; + public static final int KIND_FLAG_APPROPRIATE_FOR_AUTO_CORRECTION = 0x10000000; + + public final String mWord; + public final String mPrevWordsContext; + // The completion info from the application. Null for suggestions that don't come from + // the application (including keyboard-computed ones, so this is almost always null) + public final CompletionInfo mApplicationSpecifiedCompletionInfo; + public final int mScore; + public final int mKindAndFlags; + public final int mCodePointCount; + public final Dictionary mSourceDict; + // For auto-commit. This keeps track of the index inside the touch coordinates array + // passed to native code to get suggestions for a gesture that corresponds to the first + // letter of the second word. + public final int mIndexOfTouchPointOfSecondWord; + // For auto-commit. This is a measure of how confident we are that we can commit the + // first word of this suggestion. + public final int mAutoCommitFirstWordConfidence; + private String mDebugString = ""; + private Boolean mIsEmoji; + + /** + * Create a new suggested word info. + * @param word The string to suggest. + * @param prevWordsContext previous words context. + * @param score A measure of how likely this suggestion is. 
+ * @param kindAndFlags The kind of suggestion, as one of the above KIND_* constants with + * flags. + * @param sourceDict What instance of Dictionary produced this suggestion. + * @param indexOfTouchPointOfSecondWord See mIndexOfTouchPointOfSecondWord. + * @param autoCommitFirstWordConfidence See mAutoCommitFirstWordConfidence. + */ + public SuggestedWordInfo(final String word, final String prevWordsContext, + final int score, final int kindAndFlags, + final Dictionary sourceDict, final int indexOfTouchPointOfSecondWord, + final int autoCommitFirstWordConfidence) { + mWord = word; + mPrevWordsContext = prevWordsContext; + mApplicationSpecifiedCompletionInfo = null; + mScore = score; + mKindAndFlags = kindAndFlags; + mSourceDict = sourceDict; + mCodePointCount = StringUtils.codePointCount(mWord); + mIndexOfTouchPointOfSecondWord = indexOfTouchPointOfSecondWord; + mAutoCommitFirstWordConfidence = autoCommitFirstWordConfidence; + } + + /** + * Create a new suggested word info from an application-specified completion. + * If the passed argument or its contained text is null, this throws a NPE. + * @param applicationSpecifiedCompletion The application-specified completion info. 
+ */ + public SuggestedWordInfo(final CompletionInfo applicationSpecifiedCompletion) { + mWord = applicationSpecifiedCompletion.getText().toString(); + mPrevWordsContext = ""; + mApplicationSpecifiedCompletionInfo = applicationSpecifiedCompletion; + mScore = SuggestedWordInfo.MAX_SCORE; + mKindAndFlags = SuggestedWordInfo.KIND_APP_DEFINED; + mSourceDict = Dictionary.DICTIONARY_APPLICATION_DEFINED; + mCodePointCount = StringUtils.codePointCount(mWord); + mIndexOfTouchPointOfSecondWord = SuggestedWordInfo.NOT_AN_INDEX; + mAutoCommitFirstWordConfidence = SuggestedWordInfo.NOT_A_CONFIDENCE; + } + + public boolean isEligibleForAutoCommit() { + return (isKindOf(KIND_CORRECTION) && NOT_AN_INDEX != mIndexOfTouchPointOfSecondWord); + } + + public int getKind() { + return (mKindAndFlags & KIND_MASK_KIND); + } + + public boolean isKindOf(final int kind) { + return getKind() == kind; + } + + public boolean isPossiblyOffensive() { + return (mKindAndFlags & KIND_FLAG_POSSIBLY_OFFENSIVE) != 0; + } + + public boolean isExactMatch() { + return (mKindAndFlags & KIND_FLAG_EXACT_MATCH) != 0; + } + + public boolean isExactMatchWithIntentionalOmission() { + return (mKindAndFlags & KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) != 0; + } + + public boolean isAppropriateForAutoCorrection() { + return (mKindAndFlags & KIND_FLAG_APPROPRIATE_FOR_AUTO_CORRECTION) != 0; + } + + public void setDebugString(final String str) { + if (null == str) throw new NullPointerException("Debug info is null"); + mDebugString = str; + } + + public String getDebugString() { + return mDebugString; + } + + public String getWord() { + return mWord; + } + + public boolean isEmoji() { + if (mIsEmoji == null) { + mIsEmoji = StringUtilsKt.isEmoji(mWord); + } + return mIsEmoji; + } + + @Deprecated + public Dictionary getSourceDictionary() { + return mSourceDict; + } + + public int codePointAt(int i) { + return mWord.codePointAt(i); + } + + @Override + public String toString() { + if (TextUtils.isEmpty(mDebugString)) 
{ + return mWord; + } + return mWord + " (" + mDebugString + ")"; + } + + /** + * This will always remove the higher index if a duplicate is found. + * Will also remove all occurrences of the typed word. + * + * @param typedWord The typed word to strip from the list; ignored when null or empty. + * @param candidates The list to clean up in place. + * @return position of the first occurrence of the typed word in the candidate list before + * removal, or -1 if the list is empty, the typed word is null or empty, or it is not found. + */ + public static int removeDupsAndTypedWord( + @Nullable final String typedWord, + @NonNull final ArrayList<SuggestedWordInfo> candidates) { + if (candidates.isEmpty()) { + return -1; + } + int firstOccurrenceOfWord = -1; + if (typedWord != null && typedWord.length() > 0) { + firstOccurrenceOfWord = removeSuggestedWordInfoFromList( + typedWord, candidates, -1 /* startIndexExclusive */); + } + for (int i = 0; i < candidates.size(); ++i) { + removeSuggestedWordInfoFromList( + candidates.get(i).mWord, candidates, i /* startIndexExclusive */); + } + return firstOccurrenceOfWord; + } + + /** + * Remove every entry after startIndexExclusive whose word equals the given word. + * @return the index at which the first match was found, or -1 if nothing matched. + */ + private static int removeSuggestedWordInfoFromList( + @NonNull final String word, + @NonNull final ArrayList<SuggestedWordInfo> candidates, + final int startIndexExclusive) { + int firstOccurrenceOfWord = -1; + for (int i = startIndexExclusive + 1; i < candidates.size(); ++i) { + final SuggestedWordInfo previous = candidates.get(i); + if (word.equals(previous.mWord)) { + if (firstOccurrenceOfWord == -1) { + firstOccurrenceOfWord = i; + } + candidates.remove(i); + // Stay on the same index: the following entries shifted left by one. + --i; + } + } + return firstOccurrenceOfWord; + } + } + + private static boolean isPrediction(final int inputStyle) { + return INPUT_STYLE_PREDICTION == inputStyle + || INPUT_STYLE_BEGINNING_OF_SENTENCE_PREDICTION == inputStyle; + } + + public boolean isPrediction() { + return isPrediction(mInputStyle); + } + + /** + * @return the {@link SuggestedWordInfo} which corresponds to the word that is originally + * typed by the user. Otherwise returns {@code null}. Note that gesture input is not + * considered to be a typed word. 
+ */ + public SuggestedWordInfo getTypedWordInfoOrNull() { + if (SuggestedWords.INDEX_OF_TYPED_WORD >= size()) { + return null; + } + final SuggestedWordInfo info = getInfo(SuggestedWords.INDEX_OF_TYPED_WORD); + return (info.getKind() == SuggestedWordInfo.KIND_TYPED) ? info : null; + } +} diff --git a/app/src/main/java/be/scri/latin/common/CollectionUtils.java b/app/src/main/java/be/scri/latin/common/CollectionUtils.java new file mode 100644 index 000000000..df9418926 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/CollectionUtils.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import androidx.annotation.NonNull; + +import java.util.ArrayList; + +/** + * Utility methods for working with collections. + */ +public final class CollectionUtils { + private CollectionUtils() { + // This utility class is not publicly instantiable. + } + + /** + * Converts a sub-range of the given array to an ArrayList of the appropriate type. + * @param array Array to be converted. + * @param start First index inclusive to be converted. + * @param end Last index exclusive to be converted. + * @throws IllegalArgumentException if start or end are out of range or start > end. 
+ */ + @NonNull + public static <E> ArrayList<E> arrayAsList(@NonNull final E[] array, final int start, + final int end) { + if (start < 0 || start > end || end > array.length) { + throw new IllegalArgumentException("Invalid start: " + start + " end: " + end + + " with array.length: " + array.length); + } + + // Pre-size the list to the exact number of elements being copied. + final ArrayList<E> list = new ArrayList<>(end - start); + for (int i = start; i < end; i++) { + list.add(array[i]); + } + return list; + } +} diff --git a/app/src/main/java/be/scri/latin/common/ComposedData.kt b/app/src/main/java/be/scri/latin/common/ComposedData.kt new file mode 100644 index 000000000..f51fd7234 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/ComposedData.kt @@ -0,0 +1,62 @@ +@file:Suppress("ktlint", "detekt.all") + +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ +package be.scri.latin.common + +import kotlin.random.Random + +/** An immutable class that encapsulates a snapshot of word composition data. */ +class ComposedData( + @JvmField val mInputPointers: InputPointers, + @JvmField val mIsBatchMode: Boolean, + @JvmField val mTypedWord: String +) { + /** + * Copy the code points in the typed word to a destination array of ints. + * + * If the array is too small to hold the code points in the typed word, nothing is copied and + * -1 is returned. + * + * @param destination the array of ints. + * @return the number of copied code points. + */ + fun copyCodePointsExceptTrailingSingleQuotesAndReturnCodePointCount( + destination: IntArray + ): Int { + // lastIndex is exclusive + val lastIndex = (mTypedWord.length - StringUtils.getTrailingSingleQuotesCount(mTypedWord)) + if (lastIndex <= 0) { + return 0 // The string is empty or contains only single quotes. + } + + // The following function counts the number of code points in the text range which begins + // at index 0 and extends to the character at lastIndex - 1. 
+ val codePointSize = Character.codePointCount(mTypedWord, 0, lastIndex) + if (codePointSize > destination.size) { + return -1 + } + return StringUtils.copyCodePointsAndReturnCodePointCount( + destination, mTypedWord, 0, lastIndex, true + ) + } + + companion object { + fun createForWord(word: String): ComposedData { + val codePoints = StringUtils.toCodePointArray(word) + val coordinates = CoordinateUtils.newCoordinateArray(codePoints.size) + for (i in codePoints.indices) { + CoordinateUtils.setXYInArray(coordinates, i, Random.nextBits(2), Random.nextBits(2)) + } + val pointers = InputPointers(codePoints.size).apply { + for (i in codePoints.indices) { + addPointer(CoordinateUtils.xFromArray(coordinates, i), CoordinateUtils.yFromArray(coordinates, i), 0, 0) + } + } + return ComposedData(pointers, false, word) + } + } +} \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/common/Constants.java b/app/src/main/java/be/scri/latin/common/Constants.java new file mode 100644 index 000000000..e3e0df51a --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/Constants.java @@ -0,0 +1,295 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import androidx.annotation.NonNull; + +import be.scri.BuildConfig; + +public final class Constants { + + public static final class KeyCode { + public static final int SHIFT = -1; + public static final int CAPS_LOCK = -2; + public static final int SYMBOL_ALPHA = -3; + public static final int ALPHA = -4; + public static final int SYMBOL = -5; + public static final int MULTIPLE_CODE_POINTS = -6; + public static final int DELETE = -7; + public static final int SETTINGS = -8; + public static final int VOICE_INPUT = -9; + public static final int ACTION_NEXT = -10; + public static final int ACTION_PREVIOUS = -11; + public static final int LANGUAGE_SWITCH = -12; + public static final int EMOJI = -13; + 
public static final int CLIPBOARD = -14; + public static final int SHIFT_ENTER = -15; + public static final int NOT_SPECIFIED = -99; + public static final int TOGGLE_ONE_HANDED_MODE = -16; + public static final int SWITCH_ONE_HANDED_MODE = -17; + public static final int SPLIT_LAYOUT = -18; + public static final int NUMPAD = -19; + public static final int EMOJI_SEARCH = -20; + } + + public static final class Color { + /** + * The alpha value for fully opaque. + */ + public final static int ALPHA_OPAQUE = 255; + } + + public static final class ImeOption { + /** + * The private IME option used to indicate that no microphone should be shown for a given + * text field. For instance, this is specified by the search dialog when the dialog is + * already showing a voice search button. + */ + public static final String NO_MICROPHONE = "noMicrophoneKey"; + + /** + * The private IME option used to suppress the floating gesture preview for a given text + * field. This overrides the corresponding keyboard settings preference. + * {@link be.scri.latin.settings.SettingsValues#mGestureFloatingPreviewTextEnabled} + */ + public static final String NO_FLOATING_GESTURE_PREVIEW = "noGestureFloatingPreview"; + + private ImeOption() { + // This utility class is not publicly instantiable. + } + } + + public static final class Subtype { + /** The subtype mode used to indicate that the subtype is a keyboard. */ + public static final String KEYBOARD_MODE = "keyboard"; + + // some extra values: + // TrySuppressingImeSwitcher: not documented, but used in Android source + // SupportTouchPositionCorrection: never read, never used outside AOSP keyboard -> can be removed? + public static final class ExtraValue { + /** Indicates that this subtype is capable of entering ASCII characters (not used, but recommended for Android 9 and older). 
*/ + public static final String ASCII_CAPABLE = "AsciiCapable"; + + /** Indicates that this subtype is enabled when the default subtype is not marked as ascii capable (used where?). */ + public static final String ENABLED_WHEN_DEFAULT_IS_NOT_ASCII_CAPABLE = "EnabledWhenDefaultIsNotAsciiCapable"; + + /** Indicates that this subtype is capable of entering emoji characters (always set?). */ + public static final String EMOJI_CAPABLE = "EmojiCapable"; + + /** Indicates that the subtype does not have a shift key */ + public static final String NO_SHIFT_KEY = "NoShiftKey"; + + /** Indicates that for this subtype corrections should not be based on proximity of keys for when shifted */ + public static final String NO_SHIFT_PROXIMITY_CORRECTION = "NoShiftProximityCorrection"; + + /** + * The subtype extra value used to indicate that the display name of this subtype + * contains a "%s" for printf-like replacement and it should be replaced by + * this extra value. + * This extra value is supported on JellyBean and later. + */ + public static final String UNTRANSLATABLE_STRING_IN_SUBTYPE_NAME = "UntranslatableReplacementStringInSubtypeName"; + + /** Contains the layouts used by this subtype. This extra value is private to LatinIME.*/ + public static final String KEYBOARD_LAYOUT_SET = "KeyboardLayoutSet"; + + /** Indicates that this subtype is an additional subtype that the user defined. This extra value is private to LatinIME. */ + public static final String IS_ADDITIONAL_SUBTYPE = "isAdditionalSubtype"; + + /** The subtype extra value used to specify the combining rules. 
*/ + public static final String COMBINING_RULES = "CombiningRules"; + + /** Overrides the general popup order setting */ + public static final String POPUP_ORDER = "PopupOrder"; + + /** Overrides the general hint order / priority setting */ + public static final String HINT_ORDER = "HintOrder"; + + /** Language tags indicating enabled secondary locales */ + public static final String SECONDARY_LOCALES = "SecondaryLocales"; + + /** Overrides the general "more popups" setting */ + public static final String MORE_POPUPS = "MorePopups"; + + /** Overrides the general "localized number row" setting */ + public static final String LOCALIZED_NUMBER_ROW = "LocalizedNumberRow"; + + private ExtraValue() { + // This utility class is not publicly instantiable. + } + } + + private Subtype() { + // This utility class is not publicly instantiable. + } + } + + /** Separators for use in extra values and preferences. Notably cannot be = and , as they are already used in extra values */ + public static final class Separators { + /** key-value separator (to be used in subtype extra values) */ + public static final String KV = ":"; + /** separator between entries that might be key-value pairs (to be used in subtype extra values) */ + public static final String ENTRY = "|"; + /** separator between sets of entries (to be used for storing data for additional subtypes) */ + public static final String SET = "§"; + /** separator for sets (to be used for storing multiple extra additional subtypes in prefs) */ + public static final String SETS = ";"; + } + + public static final class TextUtils { + /** + * Capitalization mode for {@link android.text.TextUtils#getCapsMode}: don't capitalize + * characters. This value may be used with + * {@link android.text.TextUtils#CAP_MODE_CHARACTERS}, + * {@link android.text.TextUtils#CAP_MODE_WORDS}, and + * {@link android.text.TextUtils#CAP_MODE_SENTENCES}. + */ + // TODO: Straighten this out. 
It's bizarre to have to use android.text.TextUtils.CAP_MODE_* + // except for OFF that is in Constants.TextUtils. + public static final int CAP_MODE_OFF = 0; + + private TextUtils() { + // This utility class is not publicly instantiable. + } + } + + public static final int NOT_A_CODE = -1; + public static final int NOT_A_CURSOR_POSITION = -1; + // TODO: replace the following constants with state in InputTransaction? + public static final int NOT_A_COORDINATE = -1; + public static final int SUGGESTION_STRIP_COORDINATE = -2; + public static final int EXTERNAL_KEYBOARD_COORDINATE = -4; + + // A hint on how many characters to cache from the TextView. A good value of this is given by + // how many characters we need to be able to almost always find the caps mode. + public static final int EDITOR_CONTENTS_CACHE_SIZE = 1024; + // How many characters we accept for the recapitalization functionality. This needs to be + // large enough for all reasonable purposes, but avoid purposeful attacks. 100k sounds about + // right for this. + public static final int MAX_CHARACTERS_FOR_RECAPITALIZATION = 1024 * 100; + + // Key events coming any faster than this are long-presses. + public static final int LONG_PRESS_MILLISECONDS = 200; + // TODO: Set this value appropriately. + public static final int GET_SUGGESTED_WORDS_TIMEOUT = BuildConfig.DEBUG ? 500 : 200; // debug build is slow, and timeout is annoying for testing + // How many continuous deletes at which to start deleting at a higher speed. + public static final int DELETE_ACCELERATE_AT = 20; + + public static final String WORD_SEPARATOR = " "; + + public static boolean isValidCoordinate(final int coordinate) { + // Every sentinel above is negative, so a non-negative value is a real coordinate. + // Detect {@link NOT_A_COORDINATE}, {@link SUGGESTION_STRIP_COORDINATE}, + // and {@link EXTERNAL_KEYBOARD_COORDINATE}. + return coordinate >= 0; + } + + /** + * Custom request code used in + * {@link be.scri.keyboard.KeyboardActionListener#onCustomRequest(int)}. + */ + // The code to show input method picker. 
+ public static final int CUSTOM_CODE_SHOW_INPUT_METHOD_PICKER = 1; + + /** + * Some common keys code. Must be positive. + */ + public static final int CODE_ENTER = '\n'; + public static final int CODE_TAB = '\t'; + public static final int CODE_SPACE = ' '; + public static final int CODE_PERIOD = '.'; + public static final int CODE_COMMA = ','; + public static final int CODE_DASH = '-'; + public static final int CODE_SINGLE_QUOTE = '\''; + public static final int CODE_DOUBLE_QUOTE = '"'; + public static final int CODE_SLASH = '/'; + public static final int CODE_BACKSLASH = '\\'; + public static final int CODE_VERTICAL_BAR = '|'; + public static final int CODE_COMMERCIAL_AT = '@'; + public static final int CODE_PLUS = '+'; + public static final int CODE_PERCENT = '%'; + public static final int CODE_CLOSING_PARENTHESIS = ')'; + public static final int CODE_CLOSING_SQUARE_BRACKET = ']'; + public static final int CODE_CLOSING_CURLY_BRACKET = '}'; + public static final int CODE_CLOSING_ANGLE_BRACKET = '>'; + public static final int CODE_INVERTED_QUESTION_MARK = '¿'; + public static final int CODE_INVERTED_EXCLAMATION_MARK = '¡'; + public static final int CODE_GRAVE_ACCENT = '`'; + public static final int CODE_CIRCUMFLEX_ACCENT = '^'; + public static final int CODE_TILDE = '~'; + public static final int RECENTS_TEMPLATE_KEY_CODE_0 = 0x30; + public static final int RECENTS_TEMPLATE_KEY_CODE_1 = 0x31; + + public static final String REGEXP_PERIOD = "\\."; + public static final String STRING_SPACE = " "; + + public static boolean isLetterCode(final int code) { + return code >= CODE_SPACE; + } + + @NonNull + public static String printableCode(final int code) { + switch (code) { + case KeyCode.SHIFT: return "shift"; + case KeyCode.CAPS_LOCK: return "capslock"; + case KeyCode.SYMBOL_ALPHA: return "symbol_alpha"; + case KeyCode.ALPHA: return "alpha"; + case KeyCode.SYMBOL: return "symbol"; + case KeyCode.MULTIPLE_CODE_POINTS: return "text"; + case KeyCode.DELETE: return 
"delete"; + case KeyCode.SETTINGS: return "settings"; + case KeyCode.VOICE_INPUT: return "shortcut"; + case KeyCode.ACTION_NEXT: return "actionNext"; + case KeyCode.ACTION_PREVIOUS: return "actionPrevious"; + case KeyCode.LANGUAGE_SWITCH: return "languageSwitch"; + case KeyCode.EMOJI: return "emoji"; + case KeyCode.CLIPBOARD: return "clipboard"; + case KeyCode.SHIFT_ENTER: return "shiftEnter"; + case KeyCode.NOT_SPECIFIED: return "unspec"; + case CODE_TAB: return "tab"; + case CODE_ENTER: return "enter"; + case CODE_SPACE: return "space"; + case KeyCode.TOGGLE_ONE_HANDED_MODE: return "toggleOneHandedMode"; + case KeyCode.SWITCH_ONE_HANDED_MODE: return "switchOneHandedMode"; + case KeyCode.SPLIT_LAYOUT: return "splitLayout"; + case KeyCode.NUMPAD: return "numpad"; + case KeyCode.EMOJI_SEARCH: return "emojiSearch"; + default: + if (code < CODE_SPACE) return String.format("\\u%02X", code); + if (code < 0x100) return String.format("%c", code); + if (code < 0x10000) return String.format("\\u%04X", code); + return String.format("\\U%05X", code); + } + } + + /** + * Screen metrics (a.k.a. Device form factor) constants of + * {@link be.scri.latin.R.integer#config_screen_metrics}. + */ + public static final int SCREEN_METRICS_SMALL_PHONE = 0; + public static final int SCREEN_METRICS_LARGE_PHONE = 1; + public static final int SCREEN_METRICS_LARGE_TABLET = 2; + public static final int SCREEN_METRICS_SMALL_TABLET = 3; + + /** + * Default capacity of gesture points container. + * This constant is used by {@link be.scri.keyboard.internal.BatchInputArbiter} + * and etc. to preallocate regions that contain gesture event points. 
+ */ + public static final int DEFAULT_GESTURE_POINTS_CAPACITY = 128; + + public static final int MAX_IME_DECODER_RESULTS = 20; + public static final int DECODER_SCORE_SCALAR = 1000000; + public static final int DECODER_MAX_SCORE = 1000000000; + + public static final int EVENT_BACKSPACE = 1; + public static final int EVENT_REJECTION = 2; + public static final int EVENT_REVERT = 3; + + private Constants() { + // This utility class is not publicly instantiable. + } +} diff --git a/app/src/main/java/be/scri/latin/common/Constants.kt b/app/src/main/java/be/scri/latin/common/Constants.kt new file mode 100644 index 000000000..a4b483020 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/Constants.kt @@ -0,0 +1,20 @@ +@file:Suppress("ktlint", "detekt.all") + +// SPDX-License-Identifier: GPL-3.0-only +package be.scri.latin.common + +object Links { + const val DICTIONARY_URL = "https://codeberg.org/Helium314/aosp-dictionaries" + const val DICTIONARY_DOWNLOAD_SUFFIX = "/raw/branch/main/" + const val DICTIONARY_NORMAL_SUFFIX = "dictionaries/" + const val DICTIONARY_EXPERIMENTAL_SUFFIX = "dictionaries_experimental/" + const val DICTIONARY_EMOJI_CLDR_SUFFIX = "emoji_cldr_signal_dictionaries/" + const val GITHUB = "https://github.com/Helium314/HeliBoard" + const val LICENSE = "$GITHUB/blob/main/LICENSE" + const val LAYOUT_WIKI_URL = "$GITHUB/wiki/2.-Layouts" + const val WIKI_URL = "$GITHUB/wiki" + const val CUSTOM_LAYOUTS = "$GITHUB/discussions/categories/custom-layout" + const val CUSTOM_COLORS = "$GITHUB/discussions/categories/custom-colors" +} + +val combiningRange = 0x300..0x35b \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/common/CoordinateUtils.java b/app/src/main/java/be/scri/latin/common/CoordinateUtils.java new file mode 100644 index 000000000..6c22c60df --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/CoordinateUtils.java @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * 
SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import androidx.annotation.NonNull; + +public final class CoordinateUtils { + private static final int INDEX_X = 0; + private static final int INDEX_Y = 1; + private static final int ELEMENT_SIZE = INDEX_Y + 1; + + private CoordinateUtils() { + // This utility class is not publicly instantiable. + } + + @NonNull + public static int[] newInstance() { + return new int[ELEMENT_SIZE]; + } + + public static int x(@NonNull final int[] coords) { + return coords[INDEX_X]; + } + + public static int y(@NonNull final int[] coords) { + return coords[INDEX_Y]; + } + + public static void set(@NonNull final int[] coords, final int x, final int y) { + coords[INDEX_X] = x; + coords[INDEX_Y] = y; + } + + public static void copy(@NonNull final int[] destination, @NonNull final int[] source) { + destination[INDEX_X] = source[INDEX_X]; + destination[INDEX_Y] = source[INDEX_Y]; + } + + @NonNull + public static int[] newCoordinateArray(final int arraySize) { + return new int[ELEMENT_SIZE * arraySize]; + } + + @NonNull + public static int[] newCoordinateArray(final int arraySize, + final int defaultX, final int defaultY) { + final int[] result = new int[ELEMENT_SIZE * arraySize]; + for (int i = 0; i < arraySize; ++i) { + setXYInArray(result, i, defaultX, defaultY); + } + return result; + } + + public static int xFromArray(@NonNull final int[] coordsArray, final int index) { + return coordsArray[ELEMENT_SIZE * index + INDEX_X]; + } + + public static int yFromArray(@NonNull final int[] coordsArray, final int index) { + return coordsArray[ELEMENT_SIZE * index + INDEX_Y]; + } + + @NonNull + public static int[] coordinateFromArray(@NonNull final int[] coordsArray, final int index) { + final int[] coords = newInstance(); + set(coords, xFromArray(coordsArray, index), yFromArray(coordsArray, index)); + return coords; + } + + public static void setXYInArray(@NonNull final int[] coordsArray, final int 
index, + final int x, final int y) { + final int baseIndex = ELEMENT_SIZE * index; + coordsArray[baseIndex + INDEX_X] = x; + coordsArray[baseIndex + INDEX_Y] = y; + } + + public static void setCoordinateInArray(@NonNull final int[] coordsArray, final int index, + @NonNull final int[] coords) { + setXYInArray(coordsArray, index, x(coords), y(coords)); + } +} diff --git a/app/src/main/java/be/scri/latin/common/FileUtils.java b/app/src/main/java/be/scri/latin/common/FileUtils.java new file mode 100644 index 000000000..8faa45ffd --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/FileUtils.java @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import android.content.Context; +import android.net.Uri; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.concurrent.CountDownLatch; + +/** + * A simple class to help with removing directories recursively. 
+ */
+public class FileUtils {
+
+    /**
+     * Deletes {@code path}; when it is a directory, its children are deleted first.
+     * Failures to delete individual children are not reported separately.
+     *
+     * @param path the file or directory to delete.
+     * @return the result of the final {@link File#delete()} call on {@code path}.
+     */
+    public static boolean deleteRecursively(final File path) {
+        if (path.isDirectory()) {
+            final File[] files = path.listFiles();
+            if (files != null) {
+                for (final File child : files) {
+                    deleteRecursively(child);
+                }
+            }
+        }
+        return path.delete();
+    }
+
+    /**
+     * Deletes every entry of {@code dir} accepted by {@code fileNameFilter}, recursing into
+     * matched directories.
+     *
+     * @param dir the directory whose entries are examined.
+     * @param fileNameFilter selects the entries to delete.
+     * @return true only if {@code dir} is a directory, could be listed, and every matched entry
+     *         was deleted.
+     */
+    public static boolean deleteFilteredFiles(final File dir, final FilenameFilter fileNameFilter) {
+        if (!dir.isDirectory()) {
+            return false;
+        }
+        final File[] files = dir.listFiles(fileNameFilter);
+        if (files == null) {
+            return false;
+        }
+        boolean hasDeletedAllFiles = true;
+        for (final File file : files) {
+            if (!deleteRecursively(file)) {
+                hasDeletedAllFiles = false;
+            }
+        }
+        return hasDeletedAllFiles;
+    }
+
+    /**
+     * Copies the content behind {@code uri} into {@code outfile}.
+     * The copy runs synchronously on the calling thread, and the stream opened here is always
+     * closed, including when the copy fails.
+     * NOTE(review): an earlier comment described copying on a different thread to avoid
+     * NetworkOnMainThreadException; no thread is started here, so callers that may receive
+     * network-backed URIs presumably need to call this off the main thread - confirm call sites.
+     *
+     * @param uri the content URI to read from.
+     * @param context used to obtain the content resolver.
+     * @param outfile the destination file.
+     * @throws IOException if the content cannot be opened or the copy fails.
+     */
+    public static void copyContentUriToNewFile(final Uri uri, final Context context, final File outfile) throws IOException {
+        try (InputStream in = context.getContentResolver().openInputStream(uri)) {
+            // openInputStream may return null; fail with a checked exception instead of a later NPE.
+            if (in == null) {
+                throw new IOException("could not open input stream for " + uri);
+            }
+            copyStreamToNewFile(in, outfile);
+        }
+    }
+
+    /**
+     * Writes everything readable from {@code in} into {@code outfile}, creating the parent folder
+     * when it does not exist. The input stream is left open; closing it stays with the caller.
+     *
+     * @param in the stream to read from.
+     * @param outfile the destination file.
+     * @throws IOException if the parent folder is missing and cannot be created, or the copy fails.
+     */
+    public static void copyStreamToNewFile(final InputStream in, final File outfile) throws IOException {
+        final File parentFile = outfile.getParentFile();
+        if (parentFile == null || (!parentFile.exists() && !parentFile.mkdirs())) {
+            throw new IOException("could not create parent folder");
+        }
+        // try-with-resources closes the output stream even when the copy throws.
+        try (FileOutputStream out = new FileOutputStream(outfile)) {
+            copyStreamToOtherStream(in, out);
+        }
+    }
+
+    /**
+     * Copies {@code in} to {@code out} in 1024-byte chunks and flushes {@code out}.
+     * Neither stream is closed.
+     *
+     * @throws IOException if reading or writing fails.
+     */
+    public static void copyStreamToOtherStream(final InputStream in, final OutputStream out) throws IOException {
+        byte[] buf = new byte[1024];
+        int len;
+        while ((len = in.read(buf)) > 0) {
+            out.write(buf, 0, len);
+        }
+        out.flush();
+    }
+
+}
diff --git a/app/src/main/java/be/scri/latin/common/InputPointers.java b/app/src/main/java/be/scri/latin/common/InputPointers.java
new file mode 100644
index 000000000..e821eccba
--- /dev/null
+++
b/app/src/main/java/be/scri/latin/common/InputPointers.java @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import androidx.annotation.NonNull; + +// TODO: This class is not thread-safe. +public final class InputPointers { + private static final boolean DEBUG_TIME = false; + + private final int mDefaultCapacity; + private final ResizableIntArray mXCoordinates; + private final ResizableIntArray mYCoordinates; + private final ResizableIntArray mPointerIds; + private final ResizableIntArray mTimes; + + public InputPointers(final int defaultCapacity) { + mDefaultCapacity = defaultCapacity; + mXCoordinates = new ResizableIntArray(defaultCapacity); + mYCoordinates = new ResizableIntArray(defaultCapacity); + mPointerIds = new ResizableIntArray(defaultCapacity); + mTimes = new ResizableIntArray(defaultCapacity); + } + + private void fillWithLastTimeUntil(final int index) { + final int fromIndex = mTimes.getLength(); + // Fill the gap with the latest time. + // See {@link #getTime(int)} and {@link #isValidTimeStamps()}. 
+ if (fromIndex <= 0) { + return; + } + final int fillLength = index - fromIndex + 1; + if (fillLength <= 0) { + return; + } + final int lastTime = mTimes.get(fromIndex - 1); + mTimes.fill(lastTime, fromIndex, fillLength); + } + + public void addPointerAt(final int index, final int x, final int y, final int pointerId, + final int time) { + mXCoordinates.addAt(index, x); + mYCoordinates.addAt(index, y); + mPointerIds.addAt(index, pointerId); + if (DEBUG_TIME) { + fillWithLastTimeUntil(index); + } + mTimes.addAt(index, time); + } + + public void addPointer(final int x, final int y, final int pointerId, final int time) { + mXCoordinates.add(x); + mYCoordinates.add(y); + mPointerIds.add(pointerId); + mTimes.add(time); + } + + public void set(@NonNull final InputPointers ip) { + mXCoordinates.set(ip.mXCoordinates); + mYCoordinates.set(ip.mYCoordinates); + mPointerIds.set(ip.mPointerIds); + mTimes.set(ip.mTimes); + } + + public void copy(@NonNull final InputPointers ip) { + mXCoordinates.copy(ip.mXCoordinates); + mYCoordinates.copy(ip.mYCoordinates); + mPointerIds.copy(ip.mPointerIds); + mTimes.copy(ip.mTimes); + } + + /** + * Append the times, x-coordinates and y-coordinates in the specified {@link ResizableIntArray} + * to the end of this. + * @param pointerId the pointer id of the source. + * @param times the source {@link ResizableIntArray} to read the event times from. + * @param xCoordinates the source {@link ResizableIntArray} to read the x-coordinates from. + * @param yCoordinates the source {@link ResizableIntArray} to read the y-coordinates from. + * @param startPos the starting index of the data in {@code times} and etc. + * @param length the number of data to be appended. 
+ */ + public void append(final int pointerId, @NonNull final ResizableIntArray times, + @NonNull final ResizableIntArray xCoordinates, + @NonNull final ResizableIntArray yCoordinates, final int startPos, final int length) { + if (length == 0) { + return; + } + mXCoordinates.append(xCoordinates, startPos, length); + mYCoordinates.append(yCoordinates, startPos, length); + mPointerIds.fill(pointerId, mPointerIds.getLength(), length); + mTimes.append(times, startPos, length); + } + + /** + * Shift to the left by elementCount, discarding elementCount pointers at the start. + * @param elementCount how many elements to shift. + */ + public void shift(final int elementCount) { + mXCoordinates.shift(elementCount); + mYCoordinates.shift(elementCount); + mPointerIds.shift(elementCount); + mTimes.shift(elementCount); + } + + public void reset() { + final int defaultCapacity = mDefaultCapacity; + mXCoordinates.reset(defaultCapacity); + mYCoordinates.reset(defaultCapacity); + mPointerIds.reset(defaultCapacity); + mTimes.reset(defaultCapacity); + } + + public int getPointerSize() { + return mXCoordinates.getLength(); + } + + @NonNull + public int[] getXCoordinates() { + return mXCoordinates.getPrimitiveArray(); + } + + @NonNull + public int[] getYCoordinates() { + return mYCoordinates.getPrimitiveArray(); + } + + @NonNull + public int[] getPointerIds() { + return mPointerIds.getPrimitiveArray(); + } + + /** + * Gets the time each point was registered, in milliseconds, relative to the first event in the + * sequence. + * @return The time each point was registered, in milliseconds, relative to the first event in + * the sequence. 
+ */ + @NonNull + public int[] getTimes() { + return mTimes.getPrimitiveArray(); + } + + @Override + public String toString() { + return "size=" + getPointerSize() + " id=" + mPointerIds + " time=" + mTimes + + " x=" + mXCoordinates + " y=" + mYCoordinates; + } +} diff --git a/app/src/main/java/be/scri/latin/common/NativeSuggestOptions.java b/app/src/main/java/be/scri/latin/common/NativeSuggestOptions.java new file mode 100644 index 000000000..f84bb7f27 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/NativeSuggestOptions.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +public class NativeSuggestOptions { + // Need to update suggest_options.h when you add, remove or reorder options. + private static final int IS_GESTURE = 0; + private static final int USE_FULL_EDIT_DISTANCE = 1; + private static final int BLOCK_OFFENSIVE_WORDS = 2; + private static final int SPACE_AWARE_GESTURE_ENABLED = 3; + private static final int WEIGHT_FOR_LOCALE_IN_THOUSANDS = 4; + private static final int OPTIONS_SIZE = 5; + + private final int[] mOptions; + + public NativeSuggestOptions() { + mOptions = new int[OPTIONS_SIZE]; + } + + public void setIsGesture(final boolean value) { + setBooleanOption(IS_GESTURE, value); + } + + public void setIsSpaceAwareGesture(final boolean value) { + setBooleanOption(SPACE_AWARE_GESTURE_ENABLED, value); + } + + public void setUseFullEditDistance(final boolean value) { + setBooleanOption(USE_FULL_EDIT_DISTANCE, value); + } + + public void setBlockOffensiveWords(final boolean value) { + setBooleanOption(BLOCK_OFFENSIVE_WORDS, value); + } + + public void setWeightForLocale(final float value) { + // We're passing this option as a fixed point value, in thousands. This is decoded in + // native code by SuggestOptions#weightForLocale(). 
+ setIntegerOption(WEIGHT_FOR_LOCALE_IN_THOUSANDS, (int) (value * 1000)); + } + + public int[] getOptions() { + return mOptions; + } + + private void setBooleanOption(final int key, final boolean value) { + mOptions[key] = value ? 1 : 0; + } + + private void setIntegerOption(final int key, final int value) { + mOptions[key] = value; + } +} diff --git a/app/src/main/java/be/scri/latin/common/ResizableIntArray.java b/app/src/main/java/be/scri/latin/common/ResizableIntArray.java new file mode 100644 index 000000000..5f90b13f3 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/ResizableIntArray.java @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import androidx.annotation.NonNull; + +import java.util.Arrays; + +// TODO: This class is not thread-safe. +public final class ResizableIntArray { + @NonNull + private int[] mArray; + private int mLength; + + public ResizableIntArray(final int capacity) { + reset(capacity); + } + + public int get(final int index) { + if (index < mLength) { + return mArray[index]; + } + throw new ArrayIndexOutOfBoundsException("length=" + mLength + "; index=" + index); + } + + public void addAt(final int index, final int val) { + if (index < mLength) { + mArray[index] = val; + } else { + mLength = index; + add(val); + } + } + + public void add(final int val) { + final int currentLength = mLength; + ensureCapacity(currentLength + 1); + mArray[currentLength] = val; + mLength = currentLength + 1; + } + + /** + * Calculate the new capacity of {@code mArray}. + * @param minimumCapacity the minimum capacity that the {@code mArray} should have. + * @return the new capacity that the {@code mArray} should have. Returns zero when there is no + * need to expand {@code mArray}. 
+ */ + private int calculateCapacity(final int minimumCapacity) { + final int currentCapcity = mArray.length; + if (currentCapcity < minimumCapacity) { + final int nextCapacity = currentCapcity * 2; + // The following is the same as return Math.max(minimumCapacity, nextCapacity); + return minimumCapacity > nextCapacity ? minimumCapacity : nextCapacity; + } + return 0; + } + + private void ensureCapacity(final int minimumCapacity) { + final int newCapacity = calculateCapacity(minimumCapacity); + if (newCapacity > 0) { + // TODO: Implement primitive array pool. + mArray = Arrays.copyOf(mArray, newCapacity); + } + } + + public int getLength() { + return mLength; + } + + public void setLength(final int newLength) { + ensureCapacity(newLength); + mLength = newLength; + } + + public void reset(final int capacity) { + // TODO: Implement primitive array pool. + mArray = new int[capacity]; + mLength = 0; + } + + @NonNull + public int[] getPrimitiveArray() { + return mArray; + } + + public void set(@NonNull final ResizableIntArray ip) { + // TODO: Implement primitive array pool. + mArray = ip.mArray; + mLength = ip.mLength; + } + + public void copy(@NonNull final ResizableIntArray ip) { + final int newCapacity = calculateCapacity(ip.mLength); + if (newCapacity > 0) { + // TODO: Implement primitive array pool. 
+ mArray = new int[newCapacity]; + } + System.arraycopy(ip.mArray, 0, mArray, 0, ip.mLength); + mLength = ip.mLength; + } + + public void append(@NonNull final ResizableIntArray src, final int startPos, final int length) { + if (length == 0) { + return; + } + final int currentLength = mLength; + final int newLength = currentLength + length; + ensureCapacity(newLength); + System.arraycopy(src.mArray, startPos, mArray, currentLength, length); + mLength = newLength; + } + + public void fill(final int value, final int startPos, final int length) { + if (startPos < 0 || length < 0) { + throw new IllegalArgumentException("startPos=" + startPos + "; length=" + length); + } + final int endPos = startPos + length; + ensureCapacity(endPos); + Arrays.fill(mArray, startPos, endPos, value); + if (mLength < endPos) { + mLength = endPos; + } + } + + /** + * Shift to the left by elementCount, discarding elementCount pointers at the start. + * @param elementCount how many elements to shift. + */ + public void shift(final int elementCount) { + System.arraycopy(mArray, elementCount, mArray, 0, mLength - elementCount); + mLength -= elementCount; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < mLength; i++) { + if (i != 0) { + sb.append(","); + } + sb.append(mArray[i]); + } + return "[" + sb + "]"; + } +} diff --git a/app/src/main/java/be/scri/latin/common/StringUtils.java b/app/src/main/java/be/scri/latin/common/StringUtils.java new file mode 100644 index 000000000..ac38f08f0 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/StringUtils.java @@ -0,0 +1,536 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; + +import java.util.Arrays; +import java.util.Locale; + +public final class StringUtils { + + public static 
final int CAPITALIZE_NONE = 0; // No caps, or mixed case + public static final int CAPITALIZE_FIRST = 1; // First only + public static final int CAPITALIZE_ALL = 2; // All caps + + private static final char CHAR_LINE_FEED = 0X000A; + private static final char CHAR_VERTICAL_TAB = 0X000B; + private static final char CHAR_FORM_FEED = 0X000C; + private static final char CHAR_CARRIAGE_RETURN = 0X000D; + private static final char CHAR_NEXT_LINE = 0X0085; + private static final char CHAR_LINE_SEPARATOR = 0X2028; + private static final char CHAR_PARAGRAPH_SEPARATOR = 0X2029; + + private StringUtils() { + // This utility class is not publicly instantiable. + } + + // Taken from android.text.TextUtils. We are extensively using this method in many places, + // some of which don't have the android libraries available. + + /** + * Returns true if the string is null or 0-length. + * + * @param str the string to be examined + * @return true if str is null or zero length + */ + public static boolean isEmpty(@Nullable final CharSequence str) { + return (str == null || str.length() == 0); + } + + public static int codePointCount(@Nullable final CharSequence text) { + if (isEmpty(text)) { + return 0; + } + return Character.codePointCount(text, 0, text.length()); + } + + @NonNull + public static String newSingleCodePointString(final int codePoint) { + if (Character.charCount(codePoint) == 1) { + // Optimization: avoid creating a temporary array for characters that are + // represented by a single char value + return String.valueOf((char) codePoint); + } + // For surrogate pair + return new String(Character.toChars(codePoint)); + } + + @NonNull + public static String capitalizeFirstCodePoint(@NonNull final String s, + @NonNull final Locale locale) { + if (s.length() <= 1) { + return s.toUpperCase(getLocaleUsedForToTitleCase(locale)); + } + // Please refer to the comment below in + // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings + final int 
cutoff = s.offsetByCodePoints(0, 1); + return s.substring(0, cutoff).toUpperCase(getLocaleUsedForToTitleCase(locale)) + + s.substring(cutoff); + } + + @NonNull + public static String capitalizeFirstAndDowncaseRest(@NonNull final String s, + @NonNull final Locale locale) { + if (s.length() <= 1) { + return s.toUpperCase(getLocaleUsedForToTitleCase(locale)); + } + // TODO: fix the bugs below + // - It does not work for Serbian, because it fails to account for the "lj" character, + // which should be "Lj" in title case and "LJ" in upper case. + // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's + // written as two separate code points. They are two different characters but both should + // be capitalized as "IJ" as if they were a single letter in most words (not all). If the + // unicode char for the ligature is used however, it works. + final int cutoff = s.offsetByCodePoints(0, 1); + return s.substring(0, cutoff).toUpperCase(getLocaleUsedForToTitleCase(locale)) + + s.substring(cutoff).toLowerCase(locale); + } + + @NonNull + public static int[] toCodePointArray(@NonNull final CharSequence charSequence) { + return toCodePointArray(charSequence, 0, charSequence.length()); + } + + @NonNull + private static final int[] EMPTY_CODEPOINTS = {}; + + /** + * Converts a range of a string to an array of code points. + * + * @param charSequence the source string. + * @param startIndex the start index inside the string in java chars, inclusive. + * @param endIndex the end index inside the string in java chars, exclusive. + * @return a new array of code points. At most endIndex - startIndex, but possibly less. 
+ */ + @NonNull + public static int[] toCodePointArray(@NonNull final CharSequence charSequence, + final int startIndex, final int endIndex) { + final int length = charSequence.length(); + if (length <= 0) { + return EMPTY_CODEPOINTS; + } + final int[] codePoints = + new int[Character.codePointCount(charSequence, startIndex, endIndex)]; + copyCodePointsAndReturnCodePointCount(codePoints, charSequence, startIndex, endIndex, + false /* downCase */); + return codePoints; + } + + /** + * Copies the codepoints in a CharSequence to an int array. + *

+ * This method assumes there is enough space in the array to store the code points. The size + * can be measured with Character#codePointCount(CharSequence, int, int) before passing to this + * method. If the int array is too small, an ArrayIndexOutOfBoundsException will be thrown. + * Also, this method makes no effort to be thread-safe. Do not modify the CharSequence while + * this method is running, or the behavior is undefined. + * This method can optionally downcase code points before copying them, but it pays no attention + * to locale while doing so. + * + * @param destination the int array. + * @param charSequence the CharSequence. + * @param startIndex the start index inside the string in java chars, inclusive. + * @param endIndex the end index inside the string in java chars, exclusive. + * @param downCase if this is true, code points will be downcased before being copied. + * @return the number of copied code points. + */ + public static int copyCodePointsAndReturnCodePointCount(@NonNull final int[] destination, + @NonNull final CharSequence charSequence, final int startIndex, final int endIndex, + final boolean downCase) { + int destIndex = 0; + for (int index = startIndex; index < endIndex; + index = Character.offsetByCodePoints(charSequence, index, 1)) { + final int codePoint = Character.codePointAt(charSequence, index); + // TODO: stop using this, as it's not aware of the locale and does not always do + // the right thing. + destination[destIndex] = downCase ? Character.toLowerCase(codePoint) : codePoint; + destIndex++; + } + return destIndex; + } + + @NonNull + public static int[] toSortedCodePointArray(@NonNull final String string) { + final int[] codePoints = toCodePointArray(string); + Arrays.sort(codePoints); + return codePoints; + } + + /** + * Construct a String from a code point array + * + * @param codePoints a code point array that is null terminated when its logical length is + * shorter than the array length. 
+ * @return a string constructed from the code point array. + */ + @NonNull + public static String getStringFromNullTerminatedCodePointArray( + @NonNull final int[] codePoints) { + int stringLength = codePoints.length; + for (int i = 0; i < codePoints.length; i++) { + if (codePoints[i] == 0) { + stringLength = i; + break; + } + } + return new String(codePoints, 0 /* offset */, stringLength); + } + + // This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE. + public static int getCapitalizationType(@NonNull final String text) { + // If the first char is not uppercase, then the word is either all lower case or + // camel case, and in either case we return CAPITALIZE_NONE. + final int len = text.length(); + int index = 0; + for (; index < len; index = text.offsetByCodePoints(index, 1)) { + if (Character.isLetter(text.codePointAt(index))) { + break; + } + } + if (index == len) return CAPITALIZE_NONE; + if (!Character.isUpperCase(text.codePointAt(index))) { + return CAPITALIZE_NONE; + } + int capsCount = 1; + int letterCount = 1; + for (index = text.offsetByCodePoints(index, 1); index < len; + index = text.offsetByCodePoints(index, 1)) { + if (1 != capsCount && letterCount != capsCount) break; + final int codePoint = text.codePointAt(index); + if (Character.isUpperCase(codePoint)) { + ++capsCount; + ++letterCount; + } else if (Character.isLetter(codePoint)) { + // We need to discount non-letters since they may not be upper-case, but may + // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME") + ++letterCount; + } + } + // We know the first char is upper case. So we want to test if either every letter other + // than the first is lower case, or if they are all upper case. If the string is exactly + // one char long, then we will arrive here with letterCount 1, and this is correct, too. + if (1 == capsCount) return CAPITALIZE_FIRST; + return (letterCount == capsCount ? 
CAPITALIZE_ALL : CAPITALIZE_NONE); + } + + public static boolean isIdenticalAfterUpcase(@NonNull final String text) { + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = text.codePointAt(i); + if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) { + return false; + } + i += Character.charCount(codePoint); + } + return true; + } + + public static boolean isIdenticalAfterDowncase(@NonNull final String text) { + final int length = text.length(); + int i = 0; + while (i < length) { + final int codePoint = text.codePointAt(i); + if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) { + return false; + } + i += Character.charCount(codePoint); + } + return true; + } + + public static boolean isIdenticalAfterCapitalizeEachWord(@NonNull final String text, + @NonNull final int[] sortedSeparators) { + boolean needsCapsNext = true; + final int len = text.length(); + for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { + final int codePoint = text.codePointAt(i); + if (Character.isLetter(codePoint)) { + if ((needsCapsNext && !Character.isUpperCase(codePoint)) + || (!needsCapsNext && !Character.isLowerCase(codePoint))) { + return false; + } + } + // We need a capital letter next if this is a separator. + needsCapsNext = (Arrays.binarySearch(sortedSeparators, codePoint) >= 0); + } + return true; + } + + // TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ digraph + // which should be capitalized together in *some* cases. 
+ @NonNull + public static String capitalizeEachWord(@NonNull final String text, + @NonNull final int[] sortedSeparators, @NonNull final Locale locale) { + final StringBuilder builder = new StringBuilder(); + boolean needsCapsNext = true; + final int len = text.length(); + for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { + final String nextChar = text.substring(i, text.offsetByCodePoints(i, 1)); + if (needsCapsNext) { + builder.append(nextChar.toUpperCase(locale)); + } else { + builder.append(nextChar.toLowerCase(locale)); + } + // We need a capital letter next if this is a separator. + needsCapsNext = (Arrays.binarySearch(sortedSeparators, nextChar.codePointAt(0)) >= 0); + } + return builder.toString(); + } + + /** + * Approximates whether the text before the cursor looks like a URL. + *

+ * This is not foolproof, but it should work well in the practice. + * Essentially it walks backward from the cursor until it finds something that's not a letter, + * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it + * does not look like a URL. + * If the text: + * - starts with www and contains a period + * - starts with a slash preceded by either a slash, whitespace, or start-of-string + * Then it looks like a URL and we return true. Otherwise, we return false. + *

+ * Note: this method is called quite often, and should be fast. + *

+ * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the + * code complexity, but ideally it should not. It's acceptable for now. + */ + public static boolean lastPartLooksLikeURL(@NonNull final CharSequence text) { + int i = text.length(); + if (0 == i) { + return false; + } + int wCount = 0; + int slashCount = 0; + boolean hasSlash = false; + boolean hasPeriod = false; + int codePoint = 0; + while (i > 0) { + codePoint = Character.codePointBefore(text, i); + if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') { + // Handwavy heuristic to see if that's a URL character. Anything between period + // and z. This includes all lower- and upper-case ascii letters, period, + // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation + // marks, double quotes... + // Anything that's not a URL-like character causes us to break from here and + // evaluate normally. + break; + } + if (Constants.CODE_PERIOD == codePoint) { + hasPeriod = true; + } + if (Constants.CODE_SLASH == codePoint) { + hasSlash = true; + if (2 == ++slashCount) { + return true; + } + } else { + slashCount = 0; + } + if ('w' == codePoint) { + ++wCount; + } else { + wCount = 0; + } + i = Character.offsetByCodePoints(text, i, -1); + } + // End of the text run. + // If it starts with www and includes a period, then it looks like a URL. + if (wCount >= 3 && hasPeriod) { + return true; + } + // If it starts with a slash, and the code point before is whitespace, it looks like an URL. + if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) { + return true; + } + // If it has both a period and a slash, it looks like an URL. + return hasPeriod && hasSlash; + // Otherwise, it doesn't look like an URL. + } + + /** + * Examines the string and returns whether we're inside a double quote. + *

+ * This is used to decide whether we should put an automatic space before or after a double + * quote character. If we're inside a quotation, then we want to close it, so we want a space + * after and not before. Otherwise, we want to open the quotation, so we want a space before + * and not after. Exception: after a digit, we never want a space because the "inch" or + * "minutes" use cases is dominant after digits. + * In the practice, we determine whether we are in a quotation or not by finding the previous + * double quote character, and looking at whether it's followed by whitespace. If so, that + * was a closing quotation mark, so we're not inside a double quote. If it's not followed + * by whitespace, then it was an opening quotation mark, and we're inside a quotation. + * However, on the way to the double quote we can determine, some double quotes might be + * ignored, e.g. because they are followed by punctuation. These double quotes are counted and + * taken into account. + * + * @param text the text to examine. + * @return whether we're inside a double quote. + */ + public static boolean isInsideDoubleQuoteOrAfterDigit(@NonNull final CharSequence text) { + int i = text.length(); + if (0 == i) { + return false; + } + int codePoint = Character.codePointBefore(text, i); + if (Character.isDigit(codePoint)) { + return true; + } + int prevCodePoint = 0; + int ignoredDoubleQuoteCount = 0; + while (i > 0) { + codePoint = Character.codePointBefore(text, i); + if (Constants.CODE_DOUBLE_QUOTE == codePoint) { + // If we see a double quote followed by whitespace, then that + // was a closing quote. + if (Character.isWhitespace(prevCodePoint)) { + return ignoredDoubleQuoteCount % 2 == 1; + } + } + if (Character.isWhitespace(codePoint) && Constants.CODE_DOUBLE_QUOTE == prevCodePoint) { + // If we see a double quote preceded by whitespace, then that + // was an opening quote. No need to continue seeking. 
+ return ignoredDoubleQuoteCount % 2 == 0; + } + if (Constants.CODE_DOUBLE_QUOTE == prevCodePoint) { + ignoredDoubleQuoteCount++; + } + i -= Character.charCount(codePoint); + prevCodePoint = codePoint; + } + // We reached the start of text. If the first char is a double quote, then we're inside + // a double quote. Otherwise we're not. + if (ignoredDoubleQuoteCount % 2 == 0) + return Constants.CODE_DOUBLE_QUOTE == codePoint; + else + return Constants.CODE_DOUBLE_QUOTE != codePoint; + } + + public static boolean isEmptyStringOrWhiteSpaces(@NonNull final String s) { + final int N = codePointCount(s); + for (int i = 0; i < N; ++i) { + if (!Character.isWhitespace(s.codePointAt(i))) { + return false; + } + } + return true; + } + + private static final String LANGUAGE_GREEK = "el"; + + @NonNull + private static Locale getLocaleUsedForToTitleCase(@NonNull final Locale locale) { + // In Greek locale {@link String#toUpperCase(Locale)} eliminates accents from its result. + // In order to get accented upper case letter, {@link Locale#ROOT} should be used. 
+ if (LANGUAGE_GREEK.equals(locale.getLanguage())) { + return Locale.ROOT; + } + return locale; + } + + private static boolean scriptSupportsUppercase(final Locale locale) { + final String lang = locale.getLanguage(); + return !lang.equals("ar") && !lang.equals("he") && !lang.equals("iw") && !lang.equals("hi") && !lang.equals("ta") && !lang.equals("ml"); + } + + @Nullable + public static String toTitleCaseOfKeyLabel(@Nullable final String label, + @NonNull final Locale locale) { + if (label == null || !scriptSupportsUppercase(locale)) { + return label; + } + if (label.equals("ß")) + return "ẞ"; // upcasing of standalone ß, SS is not useful as s is on the keyboard anyway + + return label.toUpperCase(getLocaleUsedForToTitleCase(locale)); + } + + public static int toTitleCaseOfKeyCode(final int code, @NonNull final Locale locale) { + if (!Constants.isLetterCode(code)) { + return code; + } + final String label = newSingleCodePointString(code); + final String titleCaseLabel = toTitleCaseOfKeyLabel(label, locale); + return codePointCount(titleCaseLabel) == 1 + ? titleCaseLabel.codePointAt(0) : Constants.KeyCode.NOT_SPECIFIED; + } + + public static int getTrailingSingleQuotesCount(@NonNull final CharSequence charSequence) { + final int lastIndex = charSequence.length() - 1; + int i = lastIndex; + while (i >= 0 && charSequence.charAt(i) == Constants.CODE_SINGLE_QUOTE) { + --i; + } + return lastIndex - i; + } + + /** + * Returns whether the last composed word contains line-breaking character (e.g. CR or LF). + * + * @param text the text to be examined. + * @return {@code true} if the last composed word contains line-breaking separator. 
+ */ + public static boolean hasLineBreakCharacter(@Nullable final String text) { + if (isEmpty(text)) { + return false; + } + for (int i = text.length() - 1; i >= 0; --i) { + final char c = text.charAt(i); + switch (c) { + case CHAR_LINE_FEED: + case CHAR_VERTICAL_TAB: + case CHAR_FORM_FEED: + case CHAR_CARRIAGE_RETURN: + case CHAR_NEXT_LINE: + case CHAR_LINE_SEPARATOR: + case CHAR_PARAGRAPH_SEPARATOR: + return true; + } + } + return false; + } + + // unicode blocks that contain emojis + // very fast check, but there are very few blocks that exclusively contain emojis, + public static boolean mightBeEmoji(final int c) { + return (0x200D <= c && c <= 0x2BFF) // unicode blocks from General Punctuation to Miscellaneous Symbols and Arrows + || (0x1F104 <= c && c <= 0x1FAFF) // unicode blocks from Mahjong Tiles to Symbols and Pictographs Extended-A + || (0xE0000 <= c && c <= 0xE007F) // unicode block Tags + || c == 0xFE0F; // variation selector emoji with color + } + + public static boolean isLowerCaseAscii(final String s) { + final int length = s.length(); + for (int i = 0; i < length; i++) { + final int c = s.charAt(i); + if (c < 97 || c > 122) return false; + } + return true; + } + + public static int charIndexOfFirstWhitespace(final CharSequence s) { + for (int i = 0; i < s.length() - 1; i++) { + final char c = s.charAt(i); + if (Character.isWhitespace(c)) { + return i + 1; + } + } + return -1; + } + + public static int charIndexOfLastWhitespace(final CharSequence s) { + for (int i = s.length() - 1; i >= 0; i--) { + final char c = s.charAt(i); + if (Character.isWhitespace(c)) { + return i + 1; + } + } + return -1; + } +} diff --git a/app/src/main/java/be/scri/latin/common/StringUtils.kt b/app/src/main/java/be/scri/latin/common/StringUtils.kt new file mode 100644 index 000000000..b7d1335d5 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/StringUtils.kt @@ -0,0 +1,59 @@ +@file:Suppress("ktlint", "detekt.all") + +/* + * Copyright (C) 2013 The Android Open 
Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common + +import java.util.Locale + +fun CharSequence.codePointAt(offset: Int) = Character.codePointAt(this, offset) +fun CharSequence.codePointBefore(offset: Int) = Character.codePointBefore(this, offset) + +/** Loops over the codepoints in [text]. Exits when [loop] returns true */ +inline fun loopOverCodePoints(text: CharSequence, loop: (cp: Int, charCount: Int) -> Boolean) { + var offset = 0 + while (offset < text.length) { + val cp = text.codePointAt(offset) + val charCount = Character.charCount(cp) + if (loop(cp, charCount)) return + offset += charCount + } +} + +/** Loops backwards over the codepoints in [text]. Exits when [loop] returns true */ +inline fun loopOverCodePointsBackwards(text: CharSequence, loop: (cp: Int, charCount: Int) -> Boolean) { + var offset = text.length + while (offset > 0) { + val cp = text.codePointBefore(offset) + val charCount = Character.charCount(cp) + if (loop(cp, charCount)) return + offset -= charCount + } +} + +fun isEmoji(c: Int): Boolean = mightBeEmoji(c) && isEmoji(StringUtils.newSingleCodePointString(c)) + +fun isEmoji(text: CharSequence): Boolean = mightBeEmoji(text) && text.matches(emoRegex) + +fun mightBeEmoji(text: CharSequence): Boolean { + loopOverCodePoints(text) { cp, _ -> + if (mightBeEmoji(cp)) return true + false + } + return false +} + +fun mightBeEmoji(codePoint: Int): Boolean { + return StringUtils.mightBeEmoji(codePoint) +} + +fun String.decapitalize(locale: Locale): String { + if (isEmpty() || !this[0].isUpperCase()) return this + return replaceFirstChar { it.lowercase(locale) } +} + +private val emoRegex = 
Regex("[#*0-9]\\uFE0F?\\u20E3|[\\xA9\\xAE\\u203C\\u2049\\u2122\\u2139\\u2194-\\u2199\\u21A9\\u21AA\\u231A\\u231B\\u2328\\u23CF\\u23ED-\\u23EF\\u23F1\\u23F2\\u23F8-\\u23FA\\u24C2\\u25AA\\u25AB\\u25B6\\u25C0\\u25FB\\u25FC\\u25FE\\u2600-\\u2604\\u260E\\u2611\\u2614\\u2615\\u2618\\u2620\\u2622\\u2623\\u2626\\u262A\\u262E\\u262F\\u2638-\\u263A\\u2640\\u2642\\u2648-\\u2653\\u265F\\u2660\\u2663\\u2665\\u2666\\u2668\\u267B\\u267E\\u267F\\u2692\\u2694-\\u2697\\u2699\\u269B\\u269C\\u26A0\\u26A7\\u26AA\\u26B0\\u26B1\\u26BD\\u26BE\\u26C4\\u26C8\\u26CF\\u26D1\\u26E9\\u26F0-\\u26F5\\u26F7\\u26F8\\u26FA\\u2702\\u2708\\u2709\\u270F\\u2712\\u2714\\u2716\\u271D\\u2721\\u2733\\u2734\\u2744\\u2747\\u2757\\u2763\\u27A1\\u2934\\u2935\\u2B05-\\u2B07\\u2B1B\\u2B1C\\u2B55\\u3030\\u303D\\u3297\\u3299]") \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/common/UnicodeSurrogate.java b/app/src/main/java/be/scri/latin/common/UnicodeSurrogate.java new file mode 100644 index 000000000..cef4e9367 --- /dev/null +++ b/app/src/main/java/be/scri/latin/common/UnicodeSurrogate.java @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.common; + +/** + * Emojis are supplementary characters expressed as a low+high pair. For instance, + * the emoji U+1F625 is encoded as "\uD83D\uDE25" in UTF-16, where '\uD83D' is in + * the range of [0xd800, 0xdbff] and '\uDE25' is in the range of [0xdc00, 0xdfff]. 
+ * {@see http://docs.oracle.com/javase/6/docs/api/java/lang/Character.html#unicode} + */ +public final class UnicodeSurrogate { + private static final char LOW_SURROGATE_MIN = '\uD800'; + private static final char LOW_SURROGATE_MAX = '\uDBFF'; + private static final char HIGH_SURROGATE_MIN = '\uDC00'; + private static final char HIGH_SURROGATE_MAX = '\uDFFF'; + + public static boolean isLowSurrogate(final char c) { + return c >= LOW_SURROGATE_MIN && c <= LOW_SURROGATE_MAX; + } + + public static boolean isHighSurrogate(final char c) { + return c >= HIGH_SURROGATE_MIN && c <= HIGH_SURROGATE_MAX; + } +} diff --git a/app/src/main/java/be/scri/latin/define/DebugFlags.kt b/app/src/main/java/be/scri/latin/define/DebugFlags.kt new file mode 100644 index 000000000..091b4a85a --- /dev/null +++ b/app/src/main/java/be/scri/latin/define/DebugFlags.kt @@ -0,0 +1,20 @@ +@file:Suppress("ktlint", "detekt.all") + +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.define + +import android.content.Context + +object DebugFlags { + @JvmField + var DEBUG_ENABLED = false + + fun init(context: Context) { + // No-op for Scribe-Android + } +} \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/define/DecoderSpecificConstants.kt b/app/src/main/java/be/scri/latin/define/DecoderSpecificConstants.kt new file mode 100644 index 000000000..e4ede4c02 --- /dev/null +++ b/app/src/main/java/be/scri/latin/define/DecoderSpecificConstants.kt @@ -0,0 +1,27 @@ +@file:Suppress("ktlint", "detekt.all") + +/* + * Copyright (C) 2015 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.define + +/** + * Decoder specific constants for LatinIme. 
+ */ +object DecoderSpecificConstants { + // Must be equal to MAX_WORD_LENGTH in native/jni/src/defines.h + const val DICTIONARY_MAX_WORD_LENGTH = 48 + + // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported in Java side. Needs to modify + // MAX_PREV_WORD_COUNT_FOR_N_GRAM in native/jni/src/defines.h for suggestions. + const val MAX_PREV_WORD_COUNT_FOR_N_GRAM = 3 + const val DECODER_DICT_SUFFIX = "" + const val SHOULD_VERIFY_MAGIC_NUMBER = true + const val SHOULD_VERIFY_CHECKSUM = true + const val SHOULD_USE_DICT_VERSION = true + const val SHOULD_AUTO_CORRECT_USING_NON_WHITE_LISTED_SUGGESTION = false + const val SHOULD_REMOVE_PREVIOUSLY_REJECTED_SUGGESTION = true +} \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/define/ProductionFlags.kt b/app/src/main/java/be/scri/latin/define/ProductionFlags.kt new file mode 100644 index 000000000..09464c04d --- /dev/null +++ b/app/src/main/java/be/scri/latin/define/ProductionFlags.kt @@ -0,0 +1,25 @@ +@file:Suppress("ktlint", "detekt.all") + +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.define + +object ProductionFlags { + // supporting hardware keyboard still has a bunch of issues + // crash https://github.com/Helium314/HeliBoard/issues/2047 (possibly fixed with b7cb95fc9da213c99d82e8833fb5f950f39d232e) + // different crash https://github.com/Helium314/HeliBoard/issues/2001 + // LatinIME.isInputViewShown() returns true when there is no input view, thus crashing in onUpdateSelection + // physical layout ignored https://github.com/Helium314/HeliBoard/issues/1957, https://github.com/Helium314/HeliBoard/issues/1949 + // physical layout ignored for uppercase letters only (?) https://github.com/Helium314/HeliBoard/issues/2030 + const val IS_HARDWARE_KEYBOARD_SUPPORTED = false + + /** + * Include all suggestions from all dictionaries in + * [be.scri.latin.SuggestedWords.mRawSuggestions]. 
+ */ + const val INCLUDE_RAW_SUGGESTIONS = false +} \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/dictionary/Dictionary.java b/app/src/main/java/be/scri/latin/dictionary/Dictionary.java new file mode 100644 index 000000000..51d36e08a --- /dev/null +++ b/app/src/main/java/be/scri/latin/dictionary/Dictionary.java @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2008 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.dictionary; + +import java.util.ArrayList; +import java.util.Locale; + +import be.scri.latin.NgramContext; +import be.scri.latin.SuggestedWords.SuggestedWordInfo; +import be.scri.latin.common.ComposedData; +import be.scri.latin.makedict.WordProperty; +import be.scri.latin.settings.SettingsValuesForSuggestion; + +/** + * Abstract base class for a dictionary that can do a fuzzy search for words based on a set of key + * strokes. + */ +public abstract class Dictionary { + public static final int NOT_A_PROBABILITY = -1; + public static final float NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL = -1.0f; + + // The following types do not actually come from real dictionary instances, so we create + // corresponding instances. 
+ public static final String TYPE_USER_TYPED = "user_typed"; + public static final PhonyDictionary DICTIONARY_USER_TYPED = new PhonyDictionary(TYPE_USER_TYPED); + + public static final String TYPE_USER_SHORTCUT = "user_shortcut"; + public static final PhonyDictionary DICTIONARY_USER_SHORTCUT = + new PhonyDictionary(TYPE_USER_SHORTCUT); + + public static final String TYPE_APPLICATION_DEFINED = "application_defined"; + public static final PhonyDictionary DICTIONARY_APPLICATION_DEFINED = + new PhonyDictionary(TYPE_APPLICATION_DEFINED); + + public static final String TYPE_HARDCODED = "hardcoded"; // punctuation signs and such + public static final PhonyDictionary DICTIONARY_HARDCODED = + new PhonyDictionary(TYPE_HARDCODED); + + // Spawned by resuming suggestions. Comes from a span that was in the TextView. + public static final String TYPE_RESUMED = "resumed"; + public static final PhonyDictionary DICTIONARY_RESUMED = new PhonyDictionary(TYPE_RESUMED); + + // The following types of dictionary have actual functional instances. We don't need final + // phony dictionary instances for them. + public static final String TYPE_MAIN = "main"; + public static final String TYPE_CONTACTS = "contacts"; + public static final String TYPE_APPS = "apps"; + // User dictionary, the system-managed one. + public static final String TYPE_USER = "user"; + // User history dictionary internal to LatinIME. + public static final String TYPE_USER_HISTORY = "history"; + public static final String TYPE_EMOJI = "emoji"; + public final String mDictType; + // The locale for this dictionary. May be null if unknown (phony dictionary for example). + public final Locale mLocale; + + public Dictionary(final String dictType, final Locale locale) { + mDictType = dictType; + mLocale = locale; + } + + /** + * Searches for suggestions for a given context. + * @param composedData the key sequence to match with coordinate info + * @param ngramContext the context for n-gram. 
+ * @param proximityInfoHandle the handle for key proximity. Is ignored by some implementations. + * @param settingsValuesForSuggestion the settings values used for the suggestion. + * @param sessionId the session id. + * @param weightForLocale the weight given to this locale, to multiply the output scores for + * multilingual input. + * @param inOutWeightOfLangModelVsSpatialModel the weight of the language model as a ratio of + * the spatial model, used for generating suggestions. inOutWeightOfLangModelVsSpatialModel is + * a float array that has only one element. This can be updated when a different value is used. + * @return the list of suggestions (possibly null if none) + */ + abstract public ArrayList getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, + final SettingsValuesForSuggestion settingsValuesForSuggestion, + final int sessionId, final float weightForLocale, + final float[] inOutWeightOfLangModelVsSpatialModel); + + /** + * Checks if the given word has to be treated as a valid word. Please note that some + * dictionaries have entries that should be treated as invalid words. + * @param word the word to search for. The search should be case-insensitive. + * @return true if the word is valid, false otherwise + */ + public boolean isValidWord(final String word) { + return isInDictionary(word); + } + + /** + * Checks if the given word is in the dictionary regardless of it being valid or not. + */ + abstract public boolean isInDictionary(final String word); + + /** + * Get the frequency of the word. + * @param word the word to get the frequency of. + */ + public int getFrequency(final String word) { + return NOT_A_PROBABILITY; + } + + /** + * Get the maximum frequency of the word. + * @param word the word to get the maximum frequency of. 
+ */ + public int getMaxFrequencyOfExactMatches(final String word) { + return NOT_A_PROBABILITY; + } + + /** + * Compares the contents of the character array with the typed word and returns true if they + * are the same. + * @param word the array of characters that make up the word + * @param length the number of valid characters in the character array + * @param typedWord the word to compare with + * @return true if they are the same, false otherwise. + */ + protected boolean same(final char[] word, final int length, final String typedWord) { + if (typedWord.length() != length) { + return false; + } + for (int i = 0; i < length; i++) { + if (word[i] != typedWord.charAt(i)) { + return false; + } + } + return true; + } + + /** + * Override to clean up any resources. + */ + public void close() { + // empty base implementation + } + + public void onFinishInput() { + //empty base implementation + } + + /** + * Subclasses may override to indicate that this Dictionary is not yet properly initialized. + */ + public boolean isInitialized() { + return true; + } + + /** + * Whether we think this suggestion should trigger an auto-commit. prevWord is the word + * before the suggestion, so that we can use n-gram frequencies. + * @param candidate The candidate suggestion, in whole (not only the first part). + * @return whether we should auto-commit or not. + */ + public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { + // If we don't have support for auto-commit, or if we don't know, we return false to + // avoid auto-committing stuff. Implementations of the Dictionary class that know to + // determine whether we should auto-commit will override this. + return false; + } + + /** + * Whether this dictionary is based on data specific to the user, e.g., the user's contacts. + * @return Whether this dictionary is specific to the user. 
+ */ + public boolean isUserSpecific() { + return switch (mDictType) { + case TYPE_USER_TYPED, + TYPE_USER, + TYPE_CONTACTS, + TYPE_APPS, + TYPE_USER_HISTORY -> true; + default -> false; + }; + } + + public WordProperty getWordProperty(final String word, final boolean isBeginningOfSentence) { + return null; + } + + /** + * Not a true dictionary. A placeholder used to indicate suggestions that don't come from any + * real dictionary. + */ + public static class PhonyDictionary extends Dictionary { + PhonyDictionary(final String type) { + super(type, null); + } + + @Override + public ArrayList getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, + final SettingsValuesForSuggestion settingsValuesForSuggestion, + final int sessionId, final float weightForLocale, + final float[] inOutWeightOfLangModelVsSpatialModel) { + return null; + } + + @Override + public boolean isInDictionary(String word) { + return false; + } + } +} diff --git a/app/src/main/java/be/scri/latin/dictionary/DictionaryStats.java b/app/src/main/java/be/scri/latin/dictionary/DictionaryStats.java new file mode 100644 index 000000000..493516a14 --- /dev/null +++ b/app/src/main/java/be/scri/latin/dictionary/DictionaryStats.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.dictionary; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; + +import java.io.File; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.Locale; + +public class DictionaryStats { + public final Locale mLocale; + public final String mDictType; + public final String mDictFileName; + public final long mDictFileSize; + public final int mContentVersion; + public final int mWordCount; + + public DictionaryStats( + @NonNull final Locale locale, + @NonNull final String dictType, + @Nullable final 
String dictFileName, + @Nullable final File dictFile, + final int contentVersion) { + mLocale = locale; + mDictType = dictType; + mDictFileSize = (dictFile == null || !dictFile.exists()) ? 0 : dictFile.length(); + mDictFileName = dictFileName; + mContentVersion = contentVersion; + mWordCount = -1; + } + + public DictionaryStats( + @NonNull final Locale locale, + @NonNull final String dictType, + final int wordCount) { + mLocale = locale; + mDictType = dictType; + mDictFileSize = wordCount; + mDictFileName = null; + mContentVersion = 0; + mWordCount = wordCount; + } + + public String getFileSizeString() { + BigDecimal bytes = new BigDecimal(mDictFileSize); + BigDecimal kb = bytes.divide(new BigDecimal(1024), 2, RoundingMode.HALF_UP); + if (kb.longValue() == 0) { + return bytes + " bytes"; + } + BigDecimal mb = kb.divide(new BigDecimal(1024), 2, RoundingMode.HALF_UP); + if (mb.longValue() == 0) { + return kb + " kb"; + } + return mb + " Mb"; + } + + @Override + public String toString() { + final StringBuilder builder = new StringBuilder(mDictType); + if (mDictType.equals(Dictionary.TYPE_MAIN)) { + builder.append(" ("); + builder.append(mContentVersion); + builder.append(")"); + } + builder.append(": "); + if (mWordCount > -1) { + builder.append(mWordCount); + builder.append(" words"); + } else { + builder.append(mDictFileName); + builder.append(" / "); + builder.append(getFileSizeString()); + } + return builder.toString(); + } + + public static String toString(final Iterable<DictionaryStats> stats) { // "LM Stats" header, then one indented line per entry + final StringBuilder builder = new StringBuilder("LM Stats"); + for (DictionaryStats stat : stats) { + builder.append("\n "); + builder.append(stat.toString()); + } + return builder.toString(); + } +} diff --git a/app/src/main/java/be/scri/latin/dictionary/ReadOnlyBinaryDictionary.java b/app/src/main/java/be/scri/latin/dictionary/ReadOnlyBinaryDictionary.java new file mode 100644 index 000000000..15985577b --- /dev/null +++ 
b/app/src/main/java/be/scri/latin/dictionary/ReadOnlyBinaryDictionary.java @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.dictionary; + +import be.scri.inputmethod.latin.BinaryDictionary; + +import be.scri.latin.NgramContext; +import be.scri.latin.SuggestedWords.SuggestedWordInfo; +import be.scri.latin.common.ComposedData; +import be.scri.latin.makedict.WordProperty; +import be.scri.latin.settings.SettingsValuesForSuggestion; + +import java.util.ArrayList; +import java.util.Locale; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +/** + * This class provides binary dictionary reading operations with locking. An instance of this class + * can be used by multiple threads. Note that different session IDs must be used when multiple + * threads get suggestions using this class. + */ +public final class ReadOnlyBinaryDictionary extends Dictionary { + /** + * A lock for accessing binary dictionary. Only closing binary dictionary is the operation + * that change the state of dictionary. 
+ */ + private final ReentrantReadWriteLock mLock = new ReentrantReadWriteLock(); + + private final BinaryDictionary mBinaryDictionary; + + public ReadOnlyBinaryDictionary(final String filename, final long offset, final long length, + final boolean useFullEditDistance, final Locale locale, final String dictType) { + super(dictType, locale); + mBinaryDictionary = new BinaryDictionary(filename, offset, length, useFullEditDistance, + locale, dictType, false /* isUpdatable */); + } + + public boolean isValidDictionary() { + return mBinaryDictionary.isValidDictionary(); + } + + @Override + public ArrayList getSuggestions(final ComposedData composedData, + final NgramContext ngramContext, final long proximityInfoHandle, + final SettingsValuesForSuggestion settingsValuesForSuggestion, + final int sessionId, final float weightForLocale, + final float[] inOutWeightOfLangModelVsSpatialModel) { + if (mLock.readLock().tryLock()) { + try { + return mBinaryDictionary.getSuggestions(composedData, ngramContext, + proximityInfoHandle, settingsValuesForSuggestion, sessionId, + weightForLocale, inOutWeightOfLangModelVsSpatialModel); + } finally { + mLock.readLock().unlock(); + } + } + return null; + } + + @Override + public boolean isInDictionary(final String word) { + if (mLock.readLock().tryLock()) { + try { + return mBinaryDictionary.isInDictionary(word); + } finally { + mLock.readLock().unlock(); + } + } + return false; + } + + @Override + public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { + if (mLock.readLock().tryLock()) { + try { + return mBinaryDictionary.shouldAutoCommit(candidate); + } finally { + mLock.readLock().unlock(); + } + } + return false; + } + + @Override + public int getFrequency(final String word) { + if (mLock.readLock().tryLock()) { + try { + return mBinaryDictionary.getFrequency(word); + } finally { + mLock.readLock().unlock(); + } + } + return NOT_A_PROBABILITY; + } + + @Override + public int getMaxFrequencyOfExactMatches(final String word) 
{ + if (mLock.readLock().tryLock()) { + try { + return mBinaryDictionary.getMaxFrequencyOfExactMatches(word); + } finally { + mLock.readLock().unlock(); + } + } + return NOT_A_PROBABILITY; + } + + @Override + public WordProperty getWordProperty(String word, boolean isBeginningOfSentence) { + if (mLock.readLock().tryLock()) { + try { + return mBinaryDictionary.getWordProperty(word, isBeginningOfSentence); + } finally { + mLock.readLock().unlock(); + } + } + return null; + } + + @Override + public void close() { + mLock.writeLock().lock(); + try { + mBinaryDictionary.close(); + } finally { + mLock.writeLock().unlock(); + } + } + + public String getHash() { + return mBinaryDictionary.getHash(); + } +} diff --git a/app/src/main/java/be/scri/latin/makedict/DictionaryHeader.kt b/app/src/main/java/be/scri/latin/makedict/DictionaryHeader.kt new file mode 100644 index 000000000..171d7f2f9 --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/DictionaryHeader.kt @@ -0,0 +1,72 @@ +@file:Suppress("ktlint", "detekt.all") + +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict + +import be.scri.latin.makedict.FormatSpec.DictionaryOptions +import java.text.DateFormat +import java.util.Date +import java.util.Locale + +/** + * Class representing dictionary header. 
+ */ +class DictionaryHeader( + @JvmField val mDictionaryOptions: DictionaryOptions, +) { + val mLocaleString = mDictionaryOptions.mAttributes[DICTIONARY_LOCALE_KEY] + ?: throw UnsupportedFormatException("Cannot create a FileHeader without a locale") + @JvmField + val mVersionString = mDictionaryOptions.mAttributes[DICTIONARY_VERSION_KEY] + ?: throw UnsupportedFormatException( + "Cannot create a FileHeader without a version" + ) + @JvmField + val mIdString = mDictionaryOptions.mAttributes[DICTIONARY_ID_KEY] + ?: throw UnsupportedFormatException("Cannot create a FileHeader without an ID") + private val mDate = mDictionaryOptions.mAttributes[DICTIONARY_DATE_KEY]?.toIntOrNull() + + val description: String? + // Optional attribute: null when the header carries no description + get() = mDictionaryOptions.mAttributes[DICTIONARY_DESCRIPTION_KEY] + + private fun constructLocale(localeStr: String): Locale { + val parts = localeStr.split('_', '-') + return when (parts.size) { + 1 -> Locale(parts[0]) + 2 -> Locale(parts[0], parts[1]) + else -> Locale(parts[0], parts[1], parts[2]) + } + } + + fun info(locale: Locale): String { + val date = if (mDate == null) "" + else DateFormat.getDateInstance(DateFormat.SHORT, locale).format(Date(mDate * 1000L)) + "\n" + return mIdString + "\n" + constructLocale(mLocaleString).getDisplayName(locale) + + "\nv" + mVersionString + "\n" + date + description.orEmpty() // a missing description must not render as "null" + } + + companion object { + // Note that these are corresponding definitions in native code in latinime::HeaderPolicy + // and latinime::HeaderReadWriteUtils. 
+ const val DICTIONARY_VERSION_KEY = "version" + const val DICTIONARY_LOCALE_KEY = "locale" + const val DICTIONARY_ID_KEY = "dictionary" + const val DICTIONARY_DESCRIPTION_KEY = "description" + const val DICTIONARY_DATE_KEY = "date" + const val HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO" + const val USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE" + const val FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = + "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID" + const val MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_ENTRY_COUNT" + const val MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_ENTRY_COUNT" + const val MAX_TRIGRAM_COUNT_KEY = "MAX_TRIGRAM_ENTRY_COUNT" + const val ATTRIBUTE_VALUE_TRUE = "1" + const val CODE_POINT_TABLE_KEY = "codePointTable" + } +} \ No newline at end of file diff --git a/app/src/main/java/be/scri/latin/makedict/FormatSpec.java b/app/src/main/java/be/scri/latin/makedict/FormatSpec.java new file mode 100644 index 000000000..4c1c75c70 --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/FormatSpec.java @@ -0,0 +1,282 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict; + +import androidx.annotation.NonNull; + +import be.scri.latin.define.DecoderSpecificConstants; + +import java.util.Date; +import java.util.HashMap; + +/** + * Dictionary File Format Specification. + */ +public final class FormatSpec { + + /* + * File header layout is as follows: + * + * v | + * e | MAGIC_NUMBER + version of the file format, 2 bytes. + * r | + * sion + * + * o | + * p | not used, 2 bytes. + * o | + * nflags + * + * h | + * e | size of the file header, 4bytes + * a | including the size of the magic number, the option flags and the header size + * d | + * ersize + * + * attributes list + * + * attributes list is: + * = | string of characters at the char format described below, with the terminator used + * | to signal the end of the string. 
+ * = | string of characters at the char format described below, with the terminator used + * | to signal the end of the string. + * if the size of already read < headersize, goto key. + * + */ + + /* + * Node array (FusionDictionary.PtNodeArray) layout is as follows: + * + * n | + * o | the number of PtNodes, 1 or 2 bytes. + * d | 1 byte = bbbbbbbb match + * e | case 1xxxxxxx => xxxxxxx << 8 + next byte + * c | otherwise => bbbbbbbb + * o | + * unt + * + * n | + * o | sequence of PtNodes, + * d | the layout of each PtNode is described below. + * e | + * s + * + * f | + * o | forward link address, 3byte + * r | 1 byte = bbbbbbbb match + * w | case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte) + * a | otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte + * r | + * dlinkaddress + */ + + /* Node (FusionDictionary.PtNode) layout is as follows: + * | CHILDREN_ADDRESS_TYPE 2 bits, 11 : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES + * | 10 : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES + * f | 01 : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE + * l | 00 : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS + * a | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS + * g | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL + * s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS + * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS + * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD + * | is possibly offensive ? 1 bit, 1 = yes, 0 = no : FLAG_IS_POSSIBLY_OFFENSIVE + * + * c | IF FLAG_HAS_MULTIPLE_CHARS + * h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers + * a | end 1 byte, = 0 + * r | ELSE + * s | char 1 or 3 bytes + * | END + * + * f | + * r | IF FLAG_IS_TERMINAL + * e | frequency 1 byte + * q | + * + * c | + * h | children address, CHILDREN_ADDRESS_TYPE bytes + * i | This address is relative to the position of this field. 
+ * l | + * drenaddress + * + * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS + * | shortcut string list + * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS + * | bigrams address list + * + * Char format is: + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + * + * bigram address list is: + * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT + * | addressSign = 1 bit, : FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE + * | 1 = must take -address, 0 = must take +address + * | xx : mask with MASK_BIGRAM_ATTR_ADDRESS_TYPE + * | addressFormat = 2 bits, 00 = unused + * | 01 = 1 byte : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE + * | 10 = 2 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES + * | 11 = 3 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES + * | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY + *

<address> | IF (01 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE == addressFormat) + * | read 1 byte, add top 4 bits + * | ELSIF (10 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES == addressFormat) + * | read 2 bytes, add top 4 bits + * | ELSE // 11 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES == addressFormat + * | read 3 bytes, add top 4 bits + * | END + * | if (FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) then address = -address + * if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) goto bigram_and_shortcut_address_list_is + * + * shortcut string list is: + * <byte size> = PTNODE_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes. + * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT + * | reserved = 3 bits, must be 0 + * | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY + * <shortcut> = | string of characters at the char format described above, with the terminator + * | used to signal the end of the string. + * if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) goto flags + */ + + public static final int MAGIC_NUMBER = 0x9BC13AFE; + static final int NOT_A_VERSION_NUMBER = -1; + + // These MUST have the same values as the relevant constants in format_utils.h. + // From version 2.01 on, we use version * 100 + revision as a version number. That allows + // us to change the format during development while having testing devices remove + // older files with each upgrade, while still having a readable versioning scheme. + // When we bump up the dictionary format version, we should update + // ExpandableDictionary.needsToMigrateDictionary() and + // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType(). + public static final int VERSION2 = 2; + public static final int VERSION201 = 201; + public static final int VERSION202 = 202; + // format version for Fava Dictionaries. + public static final int VERSION_DELIGHT3 = 86736212; + public static final int MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE = VERSION201; + // Dictionary version used for testing. 
+ public static final int VERSION4_ONLY_FOR_TESTING = 399; + public static final int VERSION402 = 402; + public static final int VERSION403 = 403; + public static final int VERSION4 = VERSION403; + public static final int MINIMUM_SUPPORTED_STATIC_VERSION = VERSION202; + public static final int MAXIMUM_SUPPORTED_STATIC_VERSION = VERSION_DELIGHT3; + static final int MINIMUM_SUPPORTED_DYNAMIC_VERSION = VERSION4; + static final int MAXIMUM_SUPPORTED_DYNAMIC_VERSION = VERSION403; + + // TODO: Make this value adaptative to content data, store it in the header, and + // use it in the reading code. + static final int MAX_WORD_LENGTH = DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; + + // These flags are used only in the static dictionary. + static final int MASK_CHILDREN_ADDRESS_TYPE = 0xC0; + static final int FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS = 0x00; + static final int FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE = 0x40; + static final int FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES = 0x80; + static final int FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = 0xC0; + + static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; + + static final int FLAG_IS_TERMINAL = 0x10; + static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; + static final int FLAG_HAS_BIGRAMS = 0x04; + static final int FLAG_IS_NOT_A_WORD = 0x02; + static final int FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; + + static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80; + static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40; + static final int MASK_BIGRAM_ATTR_ADDRESS_TYPE = 0x30; + static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE = 0x10; + static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES = 0x20; + static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES = 0x30; + static final int FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY = 0x0F; + + static final int PTNODE_CHARACTERS_TERMINATOR = 0x1F; + + static final int PTNODE_TERMINATOR_SIZE = 1; + static final int PTNODE_FLAGS_SIZE = 1; + static final int PTNODE_FREQUENCY_SIZE = 1; + static 
final int PTNODE_MAX_ADDRESS_SIZE = 3; + static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1; + static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; + static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2; + + static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; + static final int INVALID_CHARACTER = -1; + + static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127 + // Large PtNode array size field size is 2 bytes. + static final int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; + static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767 + static final int MAX_BIGRAMS_IN_A_PTNODE = 10000; + static final int MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE = 0xFFFF; + + static final int MAX_TERMINAL_FREQUENCY = 255; + static final int MAX_BIGRAM_FREQUENCY = 15; + + public static final int SHORTCUT_WHITELIST_FREQUENCY = 15; + + // This option needs to be the same numeric value as the one in binary_format.h. + static final int NOT_VALID_WORD = -99; + + static final int UINT8_MAX = 0xFF; + static final int UINT16_MAX = 0xFFFF; + static final int UINT24_MAX = 0xFFFFFF; + static final int MSB8 = 0x80; + static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; + static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; + + /** + * Options global to the dictionary. 
+     */
+    public static final class DictionaryOptions {
+        public final HashMap mAttributes;
+        public DictionaryOptions(final HashMap attributes) {
+            mAttributes = attributes;
+        }
+        /** Same as {@code toString(0, false)}: no indentation, "date" shown as a calendar date. */
+        @Override
+        @NonNull public String toString() { // Convenience method
+            return toString(0, false);
+        }
+        /**
+         * Renders every attribute as a "key = value" line terminated by a newline.
+         *
+         * @param indentCount number of spaces put in front of each line; unused when
+         *        {@code plumbing} is true
+         * @param plumbing when true each line is prefixed with "H:" and the "date" value is
+         *        printed as stored; when false the "date" value is shown as a {@link Date}
+         * @return the concatenated lines, in the iteration order of the attribute key set
+         */
+        public String toString(final int indentCount, final boolean plumbing) {
+            final StringBuilder indent = new StringBuilder();
+            if (plumbing) {
+                indent.append("H:");
+            } else {
+                for (int i = 0; i < indentCount; ++i) {
+                    indent.append(" ");
+                }
+            }
+            final StringBuilder s = new StringBuilder();
+            for (final String optionKey : mAttributes.keySet()) {
+                s.append(indent);
+                s.append(optionKey);
+                s.append(" = ");
+                if ("date".equals(optionKey) && !plumbing) {
+                    // Date needs a number of milliseconds, but the dictionary contains seconds
+                    try {
+                        s.append(new Date(1000 * Long.parseLong(mAttributes.get(optionKey))));
+                    } catch (final NumberFormatException e) {
+                        // A missing or non-numeric date must not make toString() throw, so the
+                        // stored value is printed unchanged instead.
+                        s.append(mAttributes.get(optionKey));
+                    }
+                } else {
+                    s.append(mAttributes.get(optionKey));
+                }
+                s.append("\n");
+            }
+            return s.toString();
+        }
+    }
+
+    private FormatSpec() {
+        // This utility class is not publicly instantiable.
+ } +} diff --git a/app/src/main/java/be/scri/latin/makedict/NgramProperty.java b/app/src/main/java/be/scri/latin/makedict/NgramProperty.java new file mode 100644 index 000000000..64875dfc9 --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/NgramProperty.java @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict; + +import be.scri.latin.NgramContext; + +public class NgramProperty { + public final WeightedString mTargetWord; + public final NgramContext mNgramContext; + + public NgramProperty(final WeightedString targetWord, final NgramContext ngramContext) { + mTargetWord = targetWord; + mNgramContext = ngramContext; + } + + @Override + public int hashCode() { + return mTargetWord.hashCode() ^ mNgramContext.hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (!(o instanceof NgramProperty n)) return false; + return mTargetWord.equals(n.mTargetWord) && mNgramContext.equals(n.mNgramContext); + } +} diff --git a/app/src/main/java/be/scri/latin/makedict/ProbabilityInfo.java b/app/src/main/java/be/scri/latin/makedict/ProbabilityInfo.java new file mode 100644 index 000000000..95301c8c9 --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/ProbabilityInfo.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict; + +import be.scri.inputmethod.latin.BinaryDictionary; + +import java.util.Arrays; + +public final class ProbabilityInfo { + public final int mProbability; + // mTimestamp, mLevel and mCount are historical info. These values are depend on the + // implementation in native code; thus, we must not use them and have any assumptions about + // them except for tests. 
+ public final int mTimestamp; + public final int mLevel; + public final int mCount; + + public static ProbabilityInfo max(final ProbabilityInfo probabilityInfo1, + final ProbabilityInfo probabilityInfo2) { + if (probabilityInfo1 == null) { + return probabilityInfo2; + } + if (probabilityInfo2 == null) { + return probabilityInfo1; + } + return (probabilityInfo1.mProbability > probabilityInfo2.mProbability) ? probabilityInfo1 + : probabilityInfo2; + } + + public ProbabilityInfo(final int probability) { + this(probability, BinaryDictionary.NOT_A_VALID_TIMESTAMP, 0, 0); + } + + public ProbabilityInfo(final int probability, final int timestamp, final int level, + final int count) { + mProbability = probability; + mTimestamp = timestamp; + mLevel = level; + mCount = count; + } + + public boolean hasHistoricalInfo() { + return mTimestamp != BinaryDictionary.NOT_A_VALID_TIMESTAMP; + } + + @Override + public int hashCode() { + if (hasHistoricalInfo()) { + return Arrays.hashCode(new Object[] { mProbability, mTimestamp, mLevel, mCount }); + } + return Arrays.hashCode(new Object[] { mProbability }); + } + + @Override + public String toString() { + return "ProbabilityInfo: mProbability=" + mProbability + ", mTimestamp=" + mTimestamp + ", mLevel=" + mLevel + ", mCount=" + mCount; + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (!(o instanceof ProbabilityInfo p)) return false; + if (!hasHistoricalInfo() && !p.hasHistoricalInfo()) { + return mProbability == p.mProbability; + } + return mProbability == p.mProbability && mTimestamp == p.mTimestamp && mLevel == p.mLevel + && mCount == p.mCount; + } +} diff --git a/app/src/main/java/be/scri/latin/makedict/UnsupportedFormatException.java b/app/src/main/java/be/scri/latin/makedict/UnsupportedFormatException.java new file mode 100644 index 000000000..94fd736bb --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/UnsupportedFormatException.java @@ -0,0 +1,16 @@ +/* + * Copyright (C) 2011 
The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict; + +/** + * Simple exception thrown when a file format is not recognized. + */ +public final class UnsupportedFormatException extends Exception { + public UnsupportedFormatException(String description) { + super(description); + } +} diff --git a/app/src/main/java/be/scri/latin/makedict/WeightedString.java b/app/src/main/java/be/scri/latin/makedict/WeightedString.java new file mode 100644 index 000000000..55e43167d --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/WeightedString.java @@ -0,0 +1,48 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict; + +import java.util.Arrays; + +/** + * A string with a probability. + * + * This represents an "attribute", that is either a bigram or a shortcut. + */ +public final class WeightedString { + public final String mWord; + public ProbabilityInfo mProbabilityInfo; + + public WeightedString(final String word, final int probability) { + this(word, new ProbabilityInfo(probability)); + } + + public WeightedString(final String word, final ProbabilityInfo probabilityInfo) { + mWord = word; + mProbabilityInfo = probabilityInfo; + } + + public int getProbability() { + return mProbabilityInfo.mProbability; + } + + public void setProbability(final int probability) { + mProbabilityInfo = new ProbabilityInfo(probability); + } + + @Override + public int hashCode() { + return Arrays.hashCode(new Object[] { mWord, mProbabilityInfo}); + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (!(o instanceof WeightedString w)) return false; + return mWord.equals(w.mWord) && mProbabilityInfo.equals(w.mProbabilityInfo); + } +} diff --git a/app/src/main/java/be/scri/latin/makedict/WordProperty.java 
b/app/src/main/java/be/scri/latin/makedict/WordProperty.java new file mode 100644 index 000000000..f1175416a --- /dev/null +++ b/app/src/main/java/be/scri/latin/makedict/WordProperty.java @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.makedict; + +import androidx.annotation.NonNull; +import androidx.annotation.Nullable; + +import be.scri.inputmethod.latin.BinaryDictionary; +import be.scri.latin.dictionary.Dictionary; +import be.scri.latin.NgramContext; +import be.scri.latin.NgramContext.WordInfo; +import be.scri.latin.common.StringUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Utility class for a word with a probability. + *

+ * This is chiefly used to iterate a dictionary. + */ +public final class WordProperty implements Comparable { + public final String mWord; + public final ProbabilityInfo mProbabilityInfo; + public final ArrayList mShortcutTargets; + public final ArrayList mNgrams; + // TODO: Support mIsBeginningOfSentence. + public final boolean mIsBeginningOfSentence; + public final boolean mIsNotAWord; + public final boolean mIsPossiblyOffensive; + public final boolean mHasShortcuts; + public final boolean mHasNgrams; + + private int mHashCode = 0; + + // TODO: Support n-gram. + public WordProperty(final String word, final ProbabilityInfo probabilityInfo, + final ArrayList shortcutTargets, + @Nullable final ArrayList bigrams, + final boolean isNotAWord, final boolean isPossiblyOffensive) { + mWord = word; + mProbabilityInfo = probabilityInfo; + mShortcutTargets = shortcutTargets; + if (null == bigrams) { + mNgrams = null; + } else { + mNgrams = new ArrayList<>(); + final NgramContext ngramContext = new NgramContext(new WordInfo(mWord)); + for (final WeightedString bigramTarget : bigrams) { + mNgrams.add(new NgramProperty(bigramTarget, ngramContext)); + } + } + mIsBeginningOfSentence = false; + mIsNotAWord = isNotAWord; + mIsPossiblyOffensive = isPossiblyOffensive; + mHasNgrams = bigrams != null && !bigrams.isEmpty(); + mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty(); + } + + private static ProbabilityInfo createProbabilityInfoFromArray(final int[] probabilityInfo) { + return new ProbabilityInfo( + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_PROBABILITY_INDEX], + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX], + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_LEVEL_INDEX], + probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_COUNT_INDEX]); + } + + // Construct word property using information from native code. + // This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY. 
+ public WordProperty(final int[] codePoints, final boolean isNotAWord, + final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts, + final boolean isBeginningOfSentence, final int[] probabilityInfo, + final ArrayList ngramPrevWordsArray, + final ArrayList ngramPrevWordIsBeginningOfSentenceArray, + final ArrayList ngramTargets, final ArrayList ngramProbabilityInfo, + final ArrayList shortcutTargets, + final ArrayList shortcutProbabilities) { + mWord = StringUtils.getStringFromNullTerminatedCodePointArray(codePoints); + mProbabilityInfo = createProbabilityInfoFromArray(probabilityInfo); + mShortcutTargets = new ArrayList<>(); + final ArrayList ngrams = new ArrayList<>(); + mIsBeginningOfSentence = isBeginningOfSentence; + mIsNotAWord = isNotAWord; + mIsPossiblyOffensive = isPossiblyOffensive; + mHasShortcuts = hasShortcuts; + mHasNgrams = hasBigram; + + final int relatedNgramCount = ngramTargets.size(); + for (int i = 0; i < relatedNgramCount; i++) { + final String ngramTargetString = + StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i)); + final WeightedString ngramTarget = new WeightedString(ngramTargetString, + createProbabilityInfoFromArray(ngramProbabilityInfo.get(i))); + final int[][] prevWords = ngramPrevWordsArray.get(i); + final boolean[] isBeginningOfSentenceArray = + ngramPrevWordIsBeginningOfSentenceArray.get(i); + final WordInfo[] wordInfoArray = new WordInfo[prevWords.length]; + for (int j = 0; j < prevWords.length; j++) { + wordInfoArray[j] = isBeginningOfSentenceArray[j] + ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO + : new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray( + prevWords[j])); + } + final NgramContext ngramContext = new NgramContext(wordInfoArray); + ngrams.add(new NgramProperty(ngramTarget, ngramContext)); + } + mNgrams = ngrams.isEmpty() ? 
null : ngrams; + + final int shortcutTargetCount = shortcutTargets.size(); + for (int i = 0; i < shortcutTargetCount; i++) { + final String shortcutTargetString = + StringUtils.getStringFromNullTerminatedCodePointArray(shortcutTargets.get(i)); + mShortcutTargets.add( + new WeightedString(shortcutTargetString, shortcutProbabilities.get(i))); + } + } + + // TODO: Remove + public ArrayList getBigrams() { + if (null == mNgrams) { + return null; + } + final ArrayList bigrams = new ArrayList<>(); + for (final NgramProperty ngram : mNgrams) { + if (ngram.mNgramContext.getPrevWordCount() == 1) { + bigrams.add(ngram.mTargetWord); + } + } + return bigrams; + } + + public int getProbability() { + return mProbabilityInfo.mProbability; + } + + private static int computeHashCode(WordProperty word) { + return Arrays.hashCode(new Object[] { + word.mWord, + word.mProbabilityInfo, + word.mShortcutTargets, + word.mNgrams, + word.mIsNotAWord, + word.mIsPossiblyOffensive + }); + } + + /** + * Three-way comparison. + *

+ * A Word x is greater than a word y if x has a higher frequency. If they have the same + * frequency, they are sorted in lexicographic order. + */ + @Override + public int compareTo(final WordProperty w) { + if (getProbability() < w.getProbability()) return 1; + if (getProbability() > w.getProbability()) return -1; + return mWord.compareTo(w.mWord); + } + + /** + * Equality test. + *

+ * Words are equal if they have the same frequency, the same spellings, and the same + * attributes. + */ + @Override + public boolean equals(Object o) { + if (o == this) return true; + if (!(o instanceof WordProperty w)) return false; + return mProbabilityInfo.equals(w.mProbabilityInfo) && mWord.equals(w.mWord) + && mShortcutTargets.equals(w.mShortcutTargets) && equals(mNgrams, w.mNgrams) + && mIsNotAWord == w.mIsNotAWord && mIsPossiblyOffensive == w.mIsPossiblyOffensive + && mHasNgrams == w.mHasNgrams && mHasShortcuts && w.mHasNgrams; + } + + // TDOO: Have a utility method like java.util.Objects.equals. + private static boolean equals(final ArrayList a, final ArrayList b) { + if (null == a) { + return null == b; + } + return a.equals(b); + } + + @Override + public int hashCode() { + if (mHashCode == 0) { + mHashCode = computeHashCode(this); + } + return mHashCode; + } + + public boolean isValid() { + return getProbability() != Dictionary.NOT_A_PROBABILITY; + } + + @Override + @NonNull public String toString() { + return "WordProperty: mWord=" + mWord + ", mProbabilityInfo=" + mProbabilityInfo; + } +} diff --git a/app/src/main/java/be/scri/latin/settings/SettingsValuesForSuggestion.java b/app/src/main/java/be/scri/latin/settings/SettingsValuesForSuggestion.java new file mode 100644 index 000000000..18169301a --- /dev/null +++ b/app/src/main/java/be/scri/latin/settings/SettingsValuesForSuggestion.java @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * modified + * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only + */ + +package be.scri.latin.settings; + +public class SettingsValuesForSuggestion { + public final boolean mBlockPotentiallyOffensive; + + public SettingsValuesForSuggestion( + final boolean blockPotentiallyOffensive, + final boolean spaceAwareGesture + ) { + mBlockPotentiallyOffensive = blockPotentiallyOffensive; + mSpaceAwareGesture = spaceAwareGesture; + } + + public final boolean mSpaceAwareGesture; +} diff --git 
a/app/src/main/java/be/scri/latin/utils/ChecksumCalculator.kt b/app/src/main/java/be/scri/latin/utils/ChecksumCalculator.kt
new file mode 100644
index 000000000..c2054e087
--- /dev/null
+++ b/app/src/main/java/be/scri/latin/utils/ChecksumCalculator.kt
@@ -0,0 +1,41 @@
+@file:Suppress("ktlint", "detekt.all")
+
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ * modified
+ * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
+ */
+
+package be.scri.latin.utils
+
+import java.io.File
+import java.io.IOException
+import java.io.InputStream
+import java.security.MessageDigest
+import java.security.NoSuchAlgorithmException
+
+object ChecksumCalculator {
+    /**
+     * Reads the stream until read() returns 0 or less and returns the SHA-256 digest of the
+     * bytes read as lowercase hex, or null when the platform has no SHA-256 implementation.
+     * The stream is not closed here.
+     */
+    @Throws(IOException::class)
+    fun checksum(`in`: InputStream): String? {
+        val sha256 = try {
+            MessageDigest.getInstance("SHA-256")
+        } catch (_: NoSuchAlgorithmException) {
+            // Platform does not support SHA-256: can't check, so return null
+            return null
+        }
+        val buffer = ByteArray(8192)
+        while (true) {
+            val readCount = `in`.read(buffer)
+            if (readCount <= 0) break
+            sha256.update(buffer, 0, readCount)
+        }
+        // Two hex digits per digest byte.
+        return buildString {
+            for (digestByte in sha256.digest()) {
+                append(String.format("%1$02x", digestByte))
+            }
+        }
+    }
+
+    /** Checksum of the file's content; null when the file cannot be read or hashed. */
+    fun checksum(file: File) = try {
+        file.inputStream().use { stream -> checksum(stream) }
+    } catch (_: Throwable) {
+        null
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/java/be/scri/latin/utils/JniUtils.java b/app/src/main/java/be/scri/latin/utils/JniUtils.java
new file mode 100644
index 000000000..3935ff6a0
--- /dev/null
+++ b/app/src/main/java/be/scri/latin/utils/JniUtils.java
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ * modified
+ * SPDX-License-Identifier: Apache-2.0 AND GPL-3.0-only
+ */
+
+package be.scri.latin.utils;
+
+public final class JniUtils {
+    private static final String TAG =
JniUtils.class.getSimpleName();
+    public static final String JNI_LIB_NAME = "jni_latinime";
+
+    public static boolean sHaveGestureLib = false;
+    static {
+        try {
+            System.loadLibrary(JNI_LIB_NAME);
+            sHaveGestureLib = true;
+        } catch (UnsatisfiedLinkError ul) {
+            android.util.Log.w(TAG, "Could not load native library " + JNI_LIB_NAME, ul);
+        }
+    }
+
+    private JniUtils() {
+        // This utility class is not publicly instantiable.
+    }
+
+    public static void loadNativeLibrary() {
+        // Ensures the static initializer is called
+    }
+}
diff --git a/app/src/main/java/be/scri/latin/utils/Log.kt b/app/src/main/java/be/scri/latin/utils/Log.kt
new file mode 100644
index 000000000..ee7171c14
--- /dev/null
+++ b/app/src/main/java/be/scri/latin/utils/Log.kt
@@ -0,0 +1,100 @@
+@file:Suppress("ktlint", "detekt.all")
+
+package be.scri.latin.utils
+
+import android.os.Build
+import java.time.LocalDateTime
+import java.util.Date
+
+/**
+ * Logger that does the android logging, but also allows reading the log in the app.
+ * It's only a little slower than the android logger, but since both are used we end up at
+ * half performance (still fast enough to not be noticeable, unless spamming thousands of log lines)
+ *
+ * Every call stores a [LogLine] in an in-memory list and then forwards to [android.util.Log].
+ */
+object Log {
+    @JvmStatic
+    fun wtf(tag: String?, message: String) {
+        log(LogLine('F', tag, message))
+        android.util.Log.wtf(tag, message)
+    }
+
+    @JvmStatic
+    fun e(tag: String?, message: String, e: Throwable?) {
+        log(LogLine('E', tag, withStackTrace(message, e)))
+        android.util.Log.e(tag, message, e)
+    }
+
+    @JvmStatic
+    fun e(tag: String?, message: String) {
+        log(LogLine('E', tag, message))
+        android.util.Log.e(tag, message)
+    }
+
+    @JvmStatic
+    fun w(tag: String?, message: String, e: Throwable?) {
+        log(LogLine('W', tag, withStackTrace(message, e)))
+        android.util.Log.w(tag, message, e)
+    }
+
+    @JvmStatic
+    fun w(tag: String?, message: String) {
+        log(LogLine('W', tag, message))
+        android.util.Log.w(tag, message)
+    }
+
+    @JvmStatic
+    fun i(tag: String?, message: String, e: Throwable?) {
+        log(LogLine('I', tag, withStackTrace(message, e)))
+        android.util.Log.i(tag, message, e)
+    }
+
+    @JvmStatic
+    fun i(tag: String?, message: String) {
+        log(LogLine('I', tag, message))
+        android.util.Log.i(tag, message)
+    }
+
+    @JvmStatic
+    fun d(tag: String?, message: String, e: Throwable?) {
+        log(LogLine('D', tag, withStackTrace(message, e)))
+        android.util.Log.d(tag, message, e)
+    }
+
+    @JvmStatic
+    fun d(tag: String?, message: String) {
+        log(LogLine('D', tag, message))
+        android.util.Log.d(tag, message)
+    }
+
+    @JvmStatic
+    fun v(tag: String?, message: String) {
+        log(LogLine('V', tag, message))
+        android.util.Log.v(tag, message)
+    }
+
+    /**
+     * Text stored in the in-app log for a call that may carry a throwable: the message alone
+     * when [e] is null (so no literal "null" line is recorded), otherwise the message followed
+     * by the stack trace on a new line.
+     */
+    private fun withStackTrace(message: String, e: Throwable?): String =
+        if (e == null) message else "$message\n${e.stackTraceToString()}"
+
+    /** Appends [line] under the list's lock, dropping the 2000 oldest entries past 12000. */
+    private fun log(line: LogLine) {
+        synchronized(logLines) {
+            if (logLines.size > 12000) // clear oldest entries if list gets too long
+                logLines.subList(0, 2000).clear()
+            logLines.add(line)
+        }
+    }
+
+    private val logLines: MutableList = ArrayList(2000)
+
+    /** returns a copy of the last [maxLines] entries of [logLines] (all of them by default) */
+    fun getLog(maxLines: Int = logLines.size) = synchronized(logLines) { logLines.takeLast(maxLines) }
+}
+
+/** One stored log entry; the creation time is captured when the instance is constructed. */
+data class LogLine(val level: Char, val tag: String?, val message: String) {
+
+    // time can be Date or LocalDateTime, it doesn't matter because it's used for toString only
+    private val time = if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+        LocalDateTime.now()
+    } else {
+        Date(System.currentTimeMillis())
+    }
+
+    override fun toString(): String = // should look like a normal android log line, at least for api26+
+        "${time.toString().replace('T', ' ')} $level $tag: $message"
+}
\ No newline at end of file
diff --git a/app/src/main/java/be/scri/services/GeneralKeyboardIME.kt
b/app/src/main/java/be/scri/services/GeneralKeyboardIME.kt index e59b56c3a..d24ce9f43 100644 --- a/app/src/main/java/be/scri/services/GeneralKeyboardIME.kt +++ b/app/src/main/java/be/scri/services/GeneralKeyboardIME.kt @@ -46,6 +46,7 @@ import be.scri.helpers.DatabaseManagers import be.scri.helpers.EmojiUtils.insertEmoji import be.scri.helpers.KeyboardBase import be.scri.helpers.LanguageMappingConstants.getLanguageAlias +import be.scri.helpers.NativeSuggestionEngine import be.scri.helpers.PreferencesHelper import be.scri.helpers.PreferencesHelper.getHoldKeyStyle import be.scri.helpers.PreferencesHelper.getIsDarkModeOrNot @@ -121,6 +122,7 @@ abstract class GeneralKeyboardIME( private val shiftPermToggleSpeed: Int = DEFAULT_SHIFT_PERM_TOGGLE_SPEED private lateinit var dbManagers: DatabaseManagers + private lateinit var nativeSuggestionEngine: NativeSuggestionEngine private lateinit var suggestionHandler: SuggestionHandler private lateinit var autocompletionHandler: AutocompletionHandler private lateinit var autocompletionManager: AutocompletionDataManager @@ -188,11 +190,19 @@ abstract class GeneralKeyboardIME( override fun onCreate() { super.onCreate() dbManagers = DatabaseManagers(this) + nativeSuggestionEngine = NativeSuggestionEngine(this) suggestionHandler = SuggestionHandler(this) autocompletionManager = dbManagers.autocompletionManager autocompletionHandler = AutocompletionHandler(this) } + override fun onDestroy() { + if (this::nativeSuggestionEngine.isInitialized) { + nativeSuggestionEngine.close() + } + super.onDestroy() + } + /** * Creates the main view for the input method, inflating it from XML and setting up the keyboard. 
* @@ -1007,8 +1017,14 @@ abstract class GeneralKeyboardIME( fun getAutocompletions( prefix: String, limit: Int = 3, - ): List = - try { + ): List { + if (this::nativeSuggestionEngine.isInitialized) { + val nativeCompletions = nativeSuggestionEngine.getAutocompletions(language, prefix, limit) + if (nativeCompletions.isNotEmpty()) { + return nativeCompletions + } + } + return try { dbManagers.autocompletionManager.getAutocompletions(prefix, limit) } catch (e: SQLiteException) { Log.e("GeneralKeyboardIME", "Database error in autocompletion", e) @@ -1017,6 +1033,7 @@ abstract class GeneralKeyboardIME( Log.e("GeneralKeyboardIME", "Illegal state in autocompletion", e) emptyList() } + } /** * Gets the current text in the command bar without the cursor. @@ -1285,7 +1302,16 @@ abstract class GeneralKeyboardIME( fun getNextWordSuggestions( wordSuggestions: HashMap>, lastWord: String?, - ): List? = lastWord?.let { wordSuggestions[it.lowercase()] } + ): List? { + if (lastWord == null) return null + if (this::nativeSuggestionEngine.isInitialized) { + val nativeSuggestions = nativeSuggestionEngine.getNextWordSuggestions(language, lastWord) + if (nativeSuggestions.isNotEmpty()) { + return nativeSuggestions + } + } + return wordSuggestions[lastWord.lowercase()] + } /** * Finds the required grammatical case(s) for a preposition. 
@@ -1663,7 +1689,10 @@ abstract class GeneralKeyboardIME( hasLinguisticSuggestions && emojiCount == 0 -> { setSuggestionButton(uiManager.pluralBtn!!, suggestion2) } - + !hasLinguisticSuggestions && emojiCount != 0 -> { + setSuggestionButton(uiManager.binding.translateBtn, suggestion2) + uiManager.updateButtonVisibility(currentState, true, autoSuggestEmojis) + } else -> { setSuggestionButton(uiManager.binding.translateBtn, suggestion2) setSuggestionButton(uiManager.pluralBtn!!, suggestion3) diff --git a/app/src/main/jni/Android.bp b/app/src/main/jni/Android.bp new file mode 100644 index 000000000..5649fc1ea --- /dev/null +++ b/app/src/main/jni/Android.bp @@ -0,0 +1,215 @@ +// Copyright (C) 2013 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +filegroup { + name: "LATIN_IME_CORE_SRC_FILES", + srcs: [ + "src/dictionary/header/header_policy.cpp", + "src/dictionary/header/header_read_write_utils.cpp", + "src/dictionary/property/ngram_context.cpp", + "src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp", + "src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp", + "src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp", + "src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp", + "src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp", + "src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp", + "src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp", + "src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp", + "src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp", + "src/dictionary/structure/v2/patricia_trie_policy.cpp", + "src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp", + "src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp", + "src/dictionary/structure/v4/ver4_dict_buffers.cpp", + "src/dictionary/structure/v4/ver4_dict_constants.cpp", + "src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp", + "src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp", + "src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp", + "src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp", + "src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp", + "src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp", + "src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp", + "src/dictionary/structure/v4/content/language_model_dict_content.cpp", + "src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp", + "src/dictionary/structure/v4/content/shortcut_dict_content.cpp", + "src/dictionary/structure/v4/content/sparse_table_dict_content.cpp", + 
"src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp", + "src/dictionary/utils/buffer_with_extendable_buffer.cpp", + "src/dictionary/utils/byte_array_utils.cpp", + "src/dictionary/utils/dict_file_writing_utils.cpp", + "src/dictionary/utils/file_utils.cpp", + "src/dictionary/utils/forgetting_curve_utils.cpp", + "src/dictionary/utils/format_utils.cpp", + "src/dictionary/utils/mmapped_buffer.cpp", + "src/dictionary/utils/multi_bigram_map.cpp", + "src/dictionary/utils/probability_utils.cpp", + "src/dictionary/utils/sparse_table.cpp", + "src/dictionary/utils/trie_map.cpp", + "src/suggest/core/suggest.cpp", + "src/suggest/core/dicnode/dic_node.cpp", + "src/suggest/core/dicnode/dic_node_utils.cpp", + "src/suggest/core/dicnode/dic_nodes_cache.cpp", + "src/suggest/core/dictionary/dictionary.cpp", + "src/suggest/core/dictionary/dictionary_utils.cpp", + "src/suggest/core/dictionary/digraph_utils.cpp", + "src/suggest/core/dictionary/error_type_utils.cpp", + "src/suggest/core/layout/additional_proximity_chars.cpp", + "src/suggest/core/layout/proximity_info.cpp", + "src/suggest/core/layout/proximity_info_params.cpp", + "src/suggest/core/layout/proximity_info_state.cpp", + "src/suggest/core/layout/proximity_info_state_utils.cpp", + "src/suggest/core/policy/weighting.cpp", + "src/suggest/core/session/dic_traverse_session.cpp", + "src/suggest/core/result/suggestion_results.cpp", + "src/suggest/core/result/suggestions_output_utils.cpp", + "src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp", + "src/suggest/policyimpl/typing/scoring_params.cpp", + "src/suggest/policyimpl/typing/typing_scoring.cpp", + "src/suggest/policyimpl/typing/typing_suggest_policy.cpp", + "src/suggest/policyimpl/typing/typing_traversal.cpp", + "src/suggest/policyimpl/typing/typing_weighting.cpp", + "src/utils/autocorrection_threshold_utils.cpp", + "src/utils/char_utils.cpp", + "src/utils/jni_data_utils.cpp", + "src/utils/log_utils.cpp", + "src/utils/time_keeper.cpp", + + // 
BACKWARD_V402 + "src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp", + "src/dictionary/structure/backward/v402/ver4_dict_constants.cpp", + "src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp", + "src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp", + "src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp", + "src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp", + "src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp", + "src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp", + "src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp", + "src/dictionary/structure/backward/v402/content/probability_dict_content.cpp", + "src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp", + "src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp", + "src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp", + "src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp", + ], +} + +cc_library { + name: "libjni_latinime", + host_supported: true, + product_specific: true, + + sdk_version: "14", + cflags: [ + "-Werror", + "-Wall", + "-Wextra", + "-Weffc++", + "-Wformat=2", + "-Wcast-qual", + "-Wcast-align", + "-Wwrite-strings", + "-Wfloat-equal", + "-Wpointer-arith", + "-Winit-self", + "-Wredundant-decls", + "-Woverloaded-virtual", + "-Wsign-promo", + "-Wno-system-headers", + "-Wno-format-nonliteral", + + // To suppress compiler warnings for unused variables/functions used for debug features etc. 
+ "-Wno-unused-parameter", + "-Wno-unused-function", + ], + local_include_dirs: ["src"], + + srcs: [ + "com_android_inputmethod_keyboard_ProximityInfo.cpp", + "com_android_inputmethod_latin_BinaryDictionary.cpp", + "com_android_inputmethod_latin_BinaryDictionaryUtils.cpp", + "com_android_inputmethod_latin_DicTraverseSession.cpp", + "jni_common.cpp", + + ":LATIN_IME_CORE_SRC_FILES", + ], + + target: { + android_x86: { + // HACK: -mstackrealign is required for x86 builds running on pre-KitKat devices to avoid crashes + // with SSE instructions. + cflags: ["-mstackrealign"], + }, + android: { + stl: "libc++_static", + }, + host: { + cflags: ["-DHOST_TOOL"], + }, + }, +} + +cc_library_static { + name: "liblatinime_static_for_unittests", + host_supported: true, + + cflags: [ + "-Wno-unused-parameter", + "-Wno-unused-function", + "-Wall", + "-Werror", + ], + local_include_dirs: ["src"], + sdk_version: "14", + stl: "libc++_static", + + srcs: [":LATIN_IME_CORE_SRC_FILES"], +} + +cc_test { + name: "liblatinime_unittests", + host_supported: true, + + cflags: [ + "-Wno-unused-parameter", + "-Wno-unused-function", + "-Wall", + "-Werror", + ], + local_include_dirs: ["src"], + sdk_version: "14", + stl: "libc++_static", + + srcs: [ + "tests/defines_test.cpp", + "tests/dictionary/header/header_read_write_utils_test.cpp", + "tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp", + "tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp", + "tests/dictionary/structure/v4/content/probability_entry_test.cpp", + "tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp", + "tests/dictionary/utils/bloom_filter_test.cpp", + "tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp", + "tests/dictionary/utils/byte_array_utils_test.cpp", + "tests/dictionary/utils/format_utils_test.cpp", + "tests/dictionary/utils/probability_utils_test.cpp", + "tests/dictionary/utils/sparse_table_test.cpp", + 
"tests/dictionary/utils/trie_map_test.cpp", + "tests/suggest/core/dicnode/dic_node_pool_test.cpp", + "tests/suggest/core/layout/geometry_utils_test.cpp", + "tests/suggest/core/layout/normal_distribution_2d_test.cpp", + "tests/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp", + "tests/utils/autocorrection_threshold_utils_test.cpp", + "tests/utils/char_utils_test.cpp", + "tests/utils/int_array_view_test.cpp", + "tests/utils/time_keeper_test.cpp", + ], + static_libs: ["liblatinime_static_for_unittests"], +} diff --git a/app/src/main/jni/Android.mk b/app/src/main/jni/Android.mk new file mode 100755 index 000000000..0099cafb6 --- /dev/null +++ b/app/src/main/jni/Android.mk @@ -0,0 +1,106 @@ +# Copyright (C) 2011 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOCAL_PATH := $(call my-dir) + +############ some local flags +# If you change any of those flags, you need to rebuild both libjni_latinime_common_static +# and the shared library that uses libjni_latinime_common_static. 
+FLAG_DBG ?= false +FLAG_DO_PROFILE ?= false + +###################################### +include $(CLEAR_VARS) + +LATIN_IME_SRC_DIR := src + +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) + +LOCAL_CFLAGS += -Wall -Wextra -Weffc++ -Wformat=2 -Wcast-qual -Wcast-align \ + -Wwrite-strings -Wfloat-equal -Wpointer-arith -Winit-self -Wredundant-decls \ + -Woverloaded-virtual -Wsign-promo -Wno-system-headers + +# To suppress compiler warnings for unused variables/functions used for debug features etc. +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function + +# HACK: -mstackrealign is required for x86 builds running on pre-KitKat devices to avoid crashes +# with SSE instructions. +ifeq ($(TARGET_ARCH), x86) + LOCAL_CFLAGS += -mstackrealign +endif # x86 + +include $(LOCAL_PATH)/NativeFileList.mk + +LOCAL_SRC_FILES := \ + $(LATIN_IME_JNI_SRC_FILES) \ + $(addprefix $(LATIN_IME_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES)) + +ifeq ($(FLAG_DO_PROFILE), true) + $(warning Making profiling version of native library) + LOCAL_CFLAGS += -DFLAG_DO_PROFILE -funwind-tables +else # FLAG_DO_PROFILE +ifeq ($(FLAG_DBG), true) + $(warning Making debug version of native library) + LOCAL_CFLAGS += -DFLAG_DBG -funwind-tables -fno-inline +ifeq ($(FLAG_FULL_DBG), true) + $(warning Making full debug version of native library) + LOCAL_CFLAGS += -DFLAG_FULL_DBG +endif # FLAG_FULL_DBG +endif # FLAG_DBG +endif # FLAG_DO_PROFILE + +LOCAL_MODULE := libjni_latinime_common_static +LOCAL_MODULE_TAGS := optional + +LOCAL_CLANG := true +LOCAL_SDK_VERSION := 14 +LOCAL_NDK_STL_VARIANT := c++_static + +include $(BUILD_STATIC_LIBRARY) +###################################### +include $(CLEAR_VARS) + +# All code in LOCAL_WHOLE_STATIC_LIBRARIES will be built into this shared library. 
+LOCAL_WHOLE_STATIC_LIBRARIES := libjni_latinime_common_static + +ifeq ($(FLAG_DO_PROFILE), true) + $(warning Making profiling version of native library) + LOCAL_LDFLAGS += -llog +else # FLAG_DO_PROFILE +ifeq ($(FLAG_DBG), true) + $(warning Making debug version of native library) + LOCAL_LDFLAGS += -llog +endif # FLAG_DBG +endif # FLAG_DO_PROFILE + +LOCAL_MODULE := libjni_latinime +LOCAL_MODULE_TAGS := optional + +LOCAL_CLANG := true +LOCAL_SDK_VERSION := 14 +LOCAL_NDK_STL_VARIANT := c++_static +LOCAL_LDFLAGS += -ldl + +# Avoid issues with reproducible builds, see https://gitlab.com/fdroid/rfp/-/issues/2662 +LOCAL_LDFLAGS += -Wl,--build-id=none -Wl,--hash-style=both -Wl,-z,max-page-size=16384 + +include $(BUILD_SHARED_LIBRARY) +#################### Clean up the tmp vars +include $(LOCAL_PATH)/CleanupNativeFileList.mk + +#################### Unit test on host environment +#include $(LOCAL_PATH)/HostUnitTests.mk + +#################### Unit test on target environment +#include $(LOCAL_PATH)/TargetUnitTests.mk diff --git a/app/src/main/jni/Application.mk b/app/src/main/jni/Application.mk new file mode 100755 index 000000000..a169e740b --- /dev/null +++ b/app/src/main/jni/Application.mk @@ -0,0 +1,2 @@ +APP_STL := c++_static +APP_ABI := all diff --git a/app/src/main/jni/CleanupNativeFileList.mk b/app/src/main/jni/CleanupNativeFileList.mk new file mode 100755 index 000000000..eed6f1e63 --- /dev/null +++ b/app/src/main/jni/CleanupNativeFileList.mk @@ -0,0 +1,19 @@ +# Copyright (C) 2013 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +LATIN_IME_CORE_SRC_FILES := +LATIN_IME_CORE_SRC_FILES_BACKWARD_V402 := +LATIN_IME_CORE_TEST_FILES := +LATIN_IME_JNI_SRC_FILES := +LATIN_IME_SRC_DIR := diff --git a/app/src/main/jni/HostUnitTests.mk b/app/src/main/jni/HostUnitTests.mk new file mode 100755 index 000000000..6a8bcec2d --- /dev/null +++ b/app/src/main/jni/HostUnitTests.mk @@ -0,0 +1,64 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Host build is never supported in unbundled (NDK/tapas) build +ifeq (,$(TARGET_BUILD_APPS)) + +# HACK: Temporarily disable host tool build on Mac until the build system is ready for C++11. 
+LATINIME_HOST_OSNAME := $(shell uname -s) +ifneq ($(LATINIME_HOST_OSNAME), Darwin) # TODO: Remove this + +LOCAL_PATH := $(call my-dir) + +###################################### +include $(CLEAR_VARS) + +include $(LOCAL_PATH)/NativeFileList.mk + +#################### Host library for unit test +LATIN_IME_SRC_DIR := src +LOCAL_ADDRESS_SANITIZER := true +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function +LOCAL_CLANG := true +LOCAL_CXX_STL := libc++ +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) +LOCAL_MODULE := liblatinime_host_static_for_unittests +LOCAL_MODULE_TAGS := optional +LOCAL_SRC_FILES := $(addprefix $(LATIN_IME_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES)) +include $(BUILD_HOST_STATIC_LIBRARY) + +#################### Host native tests +include $(CLEAR_VARS) +LATIN_IME_TEST_SRC_DIR := tests +LOCAL_ADDRESS_SANITIZER := true +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function +LOCAL_CLANG := true +LOCAL_CXX_STL := libc++ +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) +LOCAL_MODULE := liblatinime_host_unittests +LOCAL_MODULE_TAGS := tests +LOCAL_SRC_FILES := $(addprefix $(LATIN_IME_TEST_SRC_DIR)/, $(LATIN_IME_CORE_TEST_FILES)) +LOCAL_STATIC_LIBRARIES += liblatinime_host_static_for_unittests +include $(BUILD_HOST_NATIVE_TEST) + +include $(LOCAL_PATH)/CleanupNativeFileList.mk + +endif # Darwin - TODO: Remove this + +endif # TARGET_BUILD_APPS + +#################### Clean up the tmp vars +LATINIME_HOST_OSNAME := +LATIN_IME_SRC_DIR := +LATIN_IME_TEST_SRC_DIR := diff --git a/app/src/main/jni/NativeFileList.mk b/app/src/main/jni/NativeFileList.mk new file mode 100755 index 000000000..d8b69bfd7 --- /dev/null +++ b/app/src/main/jni/NativeFileList.mk @@ -0,0 +1,146 @@ +# Copyright (C) 2013 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LATIN_IME_JNI_SRC_FILES := \ + com_android_inputmethod_keyboard_ProximityInfo.cpp \ + com_android_inputmethod_latin_BinaryDictionary.cpp \ + com_android_inputmethod_latin_BinaryDictionaryUtils.cpp \ + com_android_inputmethod_latin_DicTraverseSession.cpp \ + jni_common.cpp + +LATIN_IME_CORE_SRC_FILES := \ + $(addprefix dictionary/header/, \ + header_policy.cpp \ + header_read_write_utils.cpp) \ + dictionary/property/ngram_context.cpp \ + dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp \ + $(addprefix dictionary/structure/pt_common/, \ + bigram/bigram_list_read_write_utils.cpp \ + dynamic_pt_gc_event_listeners.cpp \ + dynamic_pt_reading_helper.cpp \ + dynamic_pt_reading_utils.cpp \ + dynamic_pt_updating_helper.cpp \ + dynamic_pt_writing_utils.cpp \ + patricia_trie_reading_utils.cpp \ + shortcut/shortcut_list_reading_utils.cpp) \ + $(addprefix dictionary/structure/v2/, \ + patricia_trie_policy.cpp \ + ver2_patricia_trie_node_reader.cpp \ + ver2_pt_node_array_reader.cpp) \ + $(addprefix dictionary/structure/v4/, \ + ver4_dict_buffers.cpp \ + ver4_dict_constants.cpp \ + ver4_patricia_trie_node_reader.cpp \ + ver4_patricia_trie_node_writer.cpp \ + ver4_patricia_trie_policy.cpp \ + ver4_patricia_trie_reading_utils.cpp \ + ver4_patricia_trie_writing_helper.cpp \ + ver4_pt_node_array_reader.cpp) \ + $(addprefix dictionary/structure/v4/content/, \ + dynamic_language_model_probability_utils.cpp \ + language_model_dict_content.cpp \ + language_model_dict_content_global_counters.cpp \ + shortcut_dict_content.cpp \ + 
sparse_table_dict_content.cpp \ + terminal_position_lookup_table.cpp) \ + $(addprefix dictionary/utils/, \ + buffer_with_extendable_buffer.cpp \ + byte_array_utils.cpp \ + dict_file_writing_utils.cpp \ + file_utils.cpp \ + forgetting_curve_utils.cpp \ + format_utils.cpp \ + mmapped_buffer.cpp \ + multi_bigram_map.cpp \ + probability_utils.cpp \ + sparse_table.cpp \ + trie_map.cpp ) \ + suggest/core/suggest.cpp \ + $(addprefix suggest/core/dicnode/, \ + dic_node.cpp \ + dic_node_utils.cpp \ + dic_nodes_cache.cpp) \ + $(addprefix suggest/core/dictionary/, \ + dictionary.cpp \ + dictionary_utils.cpp \ + digraph_utils.cpp \ + error_type_utils.cpp ) \ + $(addprefix suggest/core/layout/, \ + additional_proximity_chars.cpp \ + proximity_info.cpp \ + proximity_info_params.cpp \ + proximity_info_state.cpp \ + proximity_info_state_utils.cpp) \ + suggest/core/policy/weighting.cpp \ + suggest/core/session/dic_traverse_session.cpp \ + $(addprefix suggest/core/result/, \ + suggestion_results.cpp \ + suggestions_output_utils.cpp) \ + suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ + $(addprefix suggest/policyimpl/typing/, \ + scoring_params.cpp \ + typing_scoring.cpp \ + typing_suggest_policy.cpp \ + typing_traversal.cpp \ + typing_weighting.cpp) \ + $(addprefix utils/, \ + autocorrection_threshold_utils.cpp \ + char_utils.cpp \ + jni_data_utils.cpp \ + log_utils.cpp \ + time_keeper.cpp) + +LATIN_IME_CORE_SRC_FILES_BACKWARD_V402 := \ + $(addprefix dictionary/structure/backward/v402/, \ + ver4_dict_buffers.cpp \ + ver4_dict_constants.cpp \ + ver4_patricia_trie_node_reader.cpp \ + ver4_patricia_trie_node_writer.cpp \ + ver4_patricia_trie_policy.cpp \ + ver4_patricia_trie_reading_utils.cpp \ + ver4_patricia_trie_writing_helper.cpp \ + ver4_pt_node_array_reader.cpp) \ + $(addprefix dictionary/structure/backward/v402/content/, \ + bigram_dict_content.cpp \ + probability_dict_content.cpp \ + shortcut_dict_content.cpp \ + sparse_table_dict_content.cpp \ + 
terminal_position_lookup_table.cpp) \ + $(addprefix dictionary/structure/backward/v402/bigram/, \ + ver4_bigram_list_policy.cpp) + +LATIN_IME_CORE_SRC_FILES += $(LATIN_IME_CORE_SRC_FILES_BACKWARD_V402) + +LATIN_IME_CORE_TEST_FILES := \ + defines_test.cpp \ + dictionary/header/header_read_write_utils_test.cpp \ + dictionary/structure/v4/content/language_model_dict_content_test.cpp \ + dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp \ + dictionary/structure/v4/content/probability_entry_test.cpp \ + dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp \ + dictionary/utils/bloom_filter_test.cpp \ + dictionary/utils/buffer_with_extendable_buffer_test.cpp \ + dictionary/utils/byte_array_utils_test.cpp \ + dictionary/utils/format_utils_test.cpp \ + dictionary/utils/probability_utils_test.cpp \ + dictionary/utils/sparse_table_test.cpp \ + dictionary/utils/trie_map_test.cpp \ + suggest/core/dicnode/dic_node_pool_test.cpp \ + suggest/core/layout/geometry_utils_test.cpp \ + suggest/core/layout/normal_distribution_2d_test.cpp \ + suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp \ + utils/autocorrection_threshold_utils_test.cpp \ + utils/char_utils_test.cpp \ + utils/int_array_view_test.cpp \ + utils/time_keeper_test.cpp diff --git a/app/src/main/jni/TargetUnitTests.mk b/app/src/main/jni/TargetUnitTests.mk new file mode 100755 index 000000000..69a32edbd --- /dev/null +++ b/app/src/main/jni/TargetUnitTests.mk @@ -0,0 +1,52 @@ +# Copyright (C) 2014 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOCAL_PATH := $(call my-dir) + +###################################### +include $(CLEAR_VARS) + +include $(LOCAL_PATH)/NativeFileList.mk + +#################### Target library for unit test +LATIN_IME_SRC_DIR := src +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function +LOCAL_CLANG := true +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) +LOCAL_MODULE := liblatinime_target_static_for_unittests +LOCAL_MODULE_TAGS := optional +LOCAL_SRC_FILES := $(addprefix $(LATIN_IME_SRC_DIR)/, $(LATIN_IME_CORE_SRC_FILES)) +LOCAL_SDK_VERSION := 14 +LOCAL_NDK_STL_VARIANT := c++_static +include $(BUILD_STATIC_LIBRARY) + +#################### Target native tests +include $(CLEAR_VARS) +LATIN_IME_TEST_SRC_DIR := tests +LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function +LOCAL_CLANG := true +LOCAL_C_INCLUDES += $(LOCAL_PATH)/$(LATIN_IME_SRC_DIR) +LOCAL_MODULE := liblatinime_target_unittests +LOCAL_MODULE_TAGS := tests +LOCAL_SRC_FILES := \ + $(addprefix $(LATIN_IME_TEST_SRC_DIR)/, $(LATIN_IME_CORE_TEST_FILES)) +LOCAL_STATIC_LIBRARIES += liblatinime_target_static_for_unittests +LOCAL_SDK_VERSION := 14 +LOCAL_NDK_STL_VARIANT := c++_static +include $(BUILD_NATIVE_TEST) + +#################### Clean up the tmp vars +LATIN_IME_SRC_DIR := +LATIN_IME_TEST_SRC_DIR := +include $(LOCAL_PATH)/CleanupNativeFileList.mk diff --git a/app/src/main/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp b/app/src/main/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp new file mode 100644 index 000000000..c55852d10 --- /dev/null +++ 
b/app/src/main/jni/com_android_inputmethod_keyboard_ProximityInfo.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: jni: ProximityInfo" + +#include "com_android_inputmethod_keyboard_ProximityInfo.h" + +#include "defines.h" +#include "jni.h" +#include "jni_common.h" +#include "suggest/core/layout/proximity_info.h" + +namespace latinime { + +static jlong latinime_Keyboard_setProximityInfo(JNIEnv *env, jclass clazz, + jint displayWidth, jint displayHeight, jint gridWidth, jint gridHeight, + jint mostCommonkeyWidth, jint mostCommonkeyHeight, jintArray proximityChars, jint keyCount, + jintArray keyXCoordinates, jintArray keyYCoordinates, jintArray keyWidths, + jintArray keyHeights, jintArray keyCharCodes, jfloatArray sweetSpotCenterXs, + jfloatArray sweetSpotCenterYs, jfloatArray sweetSpotRadii) { + ProximityInfo *proximityInfo = new ProximityInfo(env, displayWidth, displayHeight, + gridWidth, gridHeight, mostCommonkeyWidth, mostCommonkeyHeight, proximityChars, + keyCount, keyXCoordinates, keyYCoordinates, keyWidths, keyHeights, keyCharCodes, + sweetSpotCenterXs, sweetSpotCenterYs, sweetSpotRadii); + return reinterpret_cast<jlong>(proximityInfo); +} + +static void latinime_Keyboard_release(JNIEnv *env, jclass clazz, jlong proximityInfo) { + ProximityInfo *pi = reinterpret_cast<ProximityInfo *>(proximityInfo); + delete pi; +} + +static const JNINativeMethod sMethods[] = { + { 
+ const_cast<char *>("setProximityInfoNative"), + const_cast<char *>("(IIIIII[II[I[I[I[I[I[F[F[F)J"), + reinterpret_cast<void *>(latinime_Keyboard_setProximityInfo) + }, + { + const_cast<char *>("releaseProximityInfoNative"), + const_cast<char *>("(J)V"), + reinterpret_cast<void *>(latinime_Keyboard_release) + } +}; + +int register_ProximityInfo(JNIEnv *env) { + const char *const kClassPathName = "be/scri/inputmethod/keyboard/ProximityInfo"; + return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods)); +} +} // namespace latinime diff --git a/app/src/main/jni/com_android_inputmethod_keyboard_ProximityInfo.h b/app/src/main/jni/com_android_inputmethod_keyboard_ProximityInfo.h new file mode 100644 index 000000000..c3503c8c3 --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_keyboard_ProximityInfo.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _COM_ANDROID_INPUTMETHOD_KEYBOARD_PROXIMITYINFO_H +#define _COM_ANDROID_INPUTMETHOD_KEYBOARD_PROXIMITYINFO_H + +#include "jni.h" + +namespace latinime { +int register_ProximityInfo(JNIEnv *env); +} // namespace latinime +#endif // _COM_ANDROID_INPUTMETHOD_KEYBOARD_PROXIMITYINFO_H diff --git a/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionary.cpp new file mode 100644 index 000000000..32c545bb6 --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -0,0 +1,744 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define LOG_TAG "LatinIME: jni: BinaryDictionary" + +#include "com_android_inputmethod_latin_BinaryDictionary.h" + +#include <cstring> // for memset() +#include <vector> + +#include "defines.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/word_property.h" +#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" +#include "jni.h" +#include "jni_common.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/result/suggestion_results.h" +#include "suggest/core/suggest_options.h" +#include "utils/char_utils.h" +#include "utils/int_array_view.h" +#include "utils/jni_data_utils.h" +#include "utils/log_utils.h" +#include "utils/profiler.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class ProximityInfo; + +static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring sourceDir, + jlong dictOffset, jlong dictSize, jboolean isUpdatable) { + PROF_INIT; + PROF_TIMER_START(66); + const jsize sourceDirUtf8Length = env->GetStringUTFLength(sourceDir); + if (sourceDirUtf8Length <= 0) { + AKLOGE("DICT: Can't get sourceDir string"); + return 0; + } + char sourceDirChars[sourceDirUtf8Length + 1]; + env->GetStringUTFRegion(sourceDir, 0, env->GetStringLength(sourceDir), sourceDirChars); + sourceDirChars[sourceDirUtf8Length] = '\0'; + DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy( + DictionaryStructureWithBufferPolicyFactory::newPolicyForExistingDictFile( + sourceDirChars, static_cast<int>(dictOffset), static_cast<int>(dictSize), + isUpdatable == JNI_TRUE)); + if (!dictionaryStructureWithBufferPolicy) { + return 0; + } + + Dictionary *const dictionary = + new Dictionary(env, std::move(dictionaryStructureWithBufferPolicy)); + PROF_TIMER_END(66); + return reinterpret_cast<jlong>(dictionary); +} + +static jlong latinime_BinaryDictionary_createOnMemory(JNIEnv *env, jclass clazz, + jlong formatVersion, jstring 
locale, jobjectArray attributeKeyStringArray, + jobjectArray attributeValueStringArray) { + const jsize localeUtf8Length = env->GetStringUTFLength(locale); + char localeChars[localeUtf8Length + 1]; + env->GetStringUTFRegion(locale, 0, env->GetStringLength(locale), localeChars); + localeChars[localeUtf8Length] = '\0'; + std::vector<int> localeCodePoints; + HeaderReadWriteUtils::insertCharactersIntoVector(localeChars, &localeCodePoints); + const int keyCount = env->GetArrayLength(attributeKeyStringArray); + const int valueCount = env->GetArrayLength(attributeValueStringArray); + if (keyCount != valueCount) { + return 0; + } + DictionaryHeaderStructurePolicy::AttributeMap attributeMap = + JniDataUtils::constructAttributeMap(env, attributeKeyStringArray, + attributeValueStringArray); + DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy = + DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryDict( + formatVersion, localeCodePoints, &attributeMap); + if (!dictionaryStructureWithBufferPolicy) { + return 0; + } + Dictionary *const dictionary = + new Dictionary(env, std::move(dictionaryStructureWithBufferPolicy)); + return reinterpret_cast<jlong>(dictionary); +} + +static bool latinime_BinaryDictionary_flush(JNIEnv *env, jclass clazz, jlong dict, + jstring filePath) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return false; + const jsize filePathUtf8Length = env->GetStringUTFLength(filePath); + char filePathChars[filePathUtf8Length + 1]; + env->GetStringUTFRegion(filePath, 0, env->GetStringLength(filePath), filePathChars); + filePathChars[filePathUtf8Length] = '\0'; + return dictionary->flush(filePathChars); +} + +static bool latinime_BinaryDictionary_needsToRunGC(JNIEnv *env, jclass clazz, + jlong dict, jboolean mindsBlockByGC) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return false; + return dictionary->needsToRunGC(mindsBlockByGC == JNI_TRUE); +} + +static bool 
latinime_BinaryDictionary_flushWithGC(JNIEnv *env, jclass clazz, jlong dict, + jstring filePath) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return false; + const jsize filePathUtf8Length = env->GetStringUTFLength(filePath); + char filePathChars[filePathUtf8Length + 1]; + env->GetStringUTFRegion(filePath, 0, env->GetStringLength(filePath), filePathChars); + filePathChars[filePathUtf8Length] = '\0'; + return dictionary->flushWithGC(filePathChars); +} + +static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return; + delete dictionary; +} + +static void latinime_BinaryDictionary_getHeaderInfo(JNIEnv *env, jclass clazz, jlong dict, + jintArray outHeaderSize, jintArray outFormatVersion, jobject outAttributeKeys, + jobject outAttributeValues) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return; + const DictionaryHeaderStructurePolicy *const headerPolicy = + dictionary->getDictionaryStructurePolicy()->getHeaderStructurePolicy(); + JniDataUtils::putIntToArray(env, outHeaderSize, 0 /* index */, headerPolicy->getSize()); + JniDataUtils::putIntToArray(env, outFormatVersion, 0 /* index */, + headerPolicy->getFormatVersionNumber()); + // Output attribute map + jclass arrayListClass = env->FindClass("java/util/ArrayList"); + jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap = + headerPolicy->getAttributeMap(); + for (DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it = attributeMap->begin(); + it != attributeMap->end(); ++it) { + // Output key + jintArray keyCodePointArray = env->NewIntArray(it->first.size()); + JniDataUtils::outputCodePoints(env, keyCodePointArray, 0 /* start */, + it->first.size(), it->first.data(), it->first.size(), + false /* needsNullTermination */); + 
env->CallBooleanMethod(outAttributeKeys, addMethodId, keyCodePointArray); + env->DeleteLocalRef(keyCodePointArray); + // Output value + jintArray valueCodePointArray = env->NewIntArray(it->second.size()); + JniDataUtils::outputCodePoints(env, valueCodePointArray, 0 /* start */, + it->second.size(), it->second.data(), it->second.size(), + false /* needsNullTermination */); + env->CallBooleanMethod(outAttributeValues, addMethodId, valueCodePointArray); + env->DeleteLocalRef(valueCodePointArray); + } + env->DeleteLocalRef(arrayListClass); + return; +} + +static int latinime_BinaryDictionary_getFormatVersion(JNIEnv *env, jclass clazz, jlong dict) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return 0; + const DictionaryHeaderStructurePolicy *const headerPolicy = + dictionary->getDictionaryStructurePolicy()->getHeaderStructurePolicy(); + return headerPolicy->getFormatVersionNumber(); +} + +static void latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, jlong dict, + jlong proximityInfo, jlong dicTraverseSession, jintArray xCoordinatesArray, + jintArray yCoordinatesArray, jintArray timesArray, jintArray pointerIdsArray, + jintArray inputCodePointsArray, jint inputSize, jintArray suggestOptions, + jobjectArray prevWordCodePointArrays, jbooleanArray isBeginningOfSentenceArray, + jint prevWordCount, jintArray outSuggestionCount, jintArray outCodePointsArray, + jintArray outScoresArray, jintArray outSpaceIndicesArray, jintArray outTypesArray, + jintArray outAutoCommitFirstWordConfidenceArray, + jfloatArray inOutWeightOfLangModelVsSpatialModel) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + // Assign 0 to outSuggestionCount here in case of returning earlier in this method. 
+ JniDataUtils::putIntToArray(env, outSuggestionCount, 0 /* index */, 0); + if (!dictionary) { + return; + } + ProximityInfo *pInfo = reinterpret_cast<ProximityInfo *>(proximityInfo); + DicTraverseSession *traverseSession = + reinterpret_cast<DicTraverseSession *>(dicTraverseSession); + if (!traverseSession) { + return; + } + // Input values + int xCoordinates[inputSize]; + int yCoordinates[inputSize]; + int times[inputSize]; + int pointerIds[inputSize]; + const jsize inputCodePointsLength = env->GetArrayLength(inputCodePointsArray); + int inputCodePoints[inputCodePointsLength]; + env->GetIntArrayRegion(xCoordinatesArray, 0, inputSize, xCoordinates); + env->GetIntArrayRegion(yCoordinatesArray, 0, inputSize, yCoordinates); + env->GetIntArrayRegion(timesArray, 0, inputSize, times); + env->GetIntArrayRegion(pointerIdsArray, 0, inputSize, pointerIds); + env->GetIntArrayRegion(inputCodePointsArray, 0, inputCodePointsLength, inputCodePoints); + + const jsize numberOfOptions = env->GetArrayLength(suggestOptions); + int options[numberOfOptions]; + env->GetIntArrayRegion(suggestOptions, 0, numberOfOptions, options); + SuggestOptions givenSuggestOptions(options, numberOfOptions); + + // Output values + /* By the way, let's check the output array length here to make sure */ + const jsize outputCodePointsLength = env->GetArrayLength(outCodePointsArray); + if (outputCodePointsLength != (MAX_WORD_LENGTH * MAX_RESULTS)) { + AKLOGE("Invalid outputCodePointsLength: %d", outputCodePointsLength); + ASSERT(false); + return; + } + const jsize scoresLength = env->GetArrayLength(outScoresArray); + if (scoresLength != MAX_RESULTS) { + AKLOGE("Invalid scoresLength: %d", scoresLength); + ASSERT(false); + return; + } + const jsize outputAutoCommitFirstWordConfidenceLength = + env->GetArrayLength(outAutoCommitFirstWordConfidenceArray); + ASSERT(outputAutoCommitFirstWordConfidenceLength == 1); + if (outputAutoCommitFirstWordConfidenceLength != 1) { + // We only use the first result, as obviously we will only ever autocommit the 
first one + AKLOGE("Invalid outputAutoCommitFirstWordConfidenceLength: %d", + outputAutoCommitFirstWordConfidenceLength); + ASSERT(false); + return; + } + float weightOfLangModelVsSpatialModel; + env->GetFloatArrayRegion(inOutWeightOfLangModelVsSpatialModel, 0, 1 /* len */, + &weightOfLangModelVsSpatialModel); + SuggestionResults suggestionResults(MAX_RESULTS); + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordCodePointArrays, isBeginningOfSentenceArray, prevWordCount); + if (givenSuggestOptions.isGesture() || inputSize > 0) { + // TODO: Use SuggestionResults to return suggestions. + dictionary->getSuggestions(pInfo, traverseSession, xCoordinates, yCoordinates, + times, pointerIds, inputCodePoints, inputSize, &ngramContext, + &givenSuggestOptions, weightOfLangModelVsSpatialModel, &suggestionResults); + } else { + dictionary->getPredictions(&ngramContext, &suggestionResults); + } + if (DEBUG_DICT) { + suggestionResults.dumpSuggestions(); + } + suggestionResults.outputSuggestions(env, outSuggestionCount, outCodePointsArray, + outScoresArray, outSpaceIndicesArray, outTypesArray, + outAutoCommitFirstWordConfidenceArray, inOutWeightOfLangModelVsSpatialModel); +} + +static jint latinime_BinaryDictionary_getProbability(JNIEnv *env, jclass clazz, jlong dict, + jintArray word) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return NOT_A_PROBABILITY; + const jsize codePointCount = env->GetArrayLength(word); + int codePoints[codePointCount]; + env->GetIntArrayRegion(word, 0, codePointCount, codePoints); + return dictionary->getProbability(CodePointArrayView(codePoints, codePointCount)); +} + +static jint latinime_BinaryDictionary_getMaxProbabilityOfExactMatches( + JNIEnv *env, jclass clazz, jlong dict, jintArray word) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return NOT_A_PROBABILITY; + const jsize codePointCount = env->GetArrayLength(word); + int codePoints[codePointCount]; + 
env->GetIntArrayRegion(word, 0, codePointCount, codePoints); + return dictionary->getMaxProbabilityOfExactMatches( + CodePointArrayView(codePoints, codePointCount)); +} + +static jint latinime_BinaryDictionary_getNgramProbability(JNIEnv *env, jclass clazz, + jlong dict, jobjectArray prevWordCodePointArrays, jbooleanArray isBeginningOfSentenceArray, + jintArray word) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) return NOT_A_PROBABILITY; // 0 (JNI_FALSE) would read as a real probability + const jsize wordLength = env->GetArrayLength(word); + int wordCodePoints[wordLength]; + env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordCodePointArrays, isBeginningOfSentenceArray, + env->GetArrayLength(prevWordCodePointArrays)); + return dictionary->getNgramProbability(&ngramContext, + CodePointArrayView(wordCodePoints, wordLength)); +} + +// Method to iterate all words in the dictionary for makedict. +// If token is 0, this method newly starts iterating the dictionary. This method returns 0 when +// the dictionary does not have a next word. 
+static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz, + jlong dict, jint token, jintArray outCodePoints, jbooleanArray outIsBeginningOfSentence) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) return 0; + const jsize codePointBufSize = env->GetArrayLength(outCodePoints); + if (codePointBufSize != MAX_WORD_LENGTH) { + AKLOGE("Invalid outCodePointsLength: %d", codePointBufSize); + ASSERT(false); + return 0; + } + int wordCodePoints[codePointBufSize]; + int wordCodePointCount = 0; + const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints, + &wordCodePointCount); + JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, + MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount, + false /* needsNullTermination */); + bool isBeginningOfSentence = false; + if (wordCodePointCount > 0 && wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { + isBeginningOfSentence = true; + } + JniDataUtils::putBooleanToArray(env, outIsBeginningOfSentence, 0 /* index */, + isBeginningOfSentence); + return nextToken; +} + +static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, + jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints, + jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, + jobject outNgramProbabilityInfo, jobject outShortcutTargets, + jobject outShortcutProbabilities) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) return; + const jsize wordLength = env->GetArrayLength(word); + if (wordLength > MAX_WORD_LENGTH) { + AKLOGE("Invalid wordLength: %d", wordLength); + return; + } + int wordCodePoints[MAX_WORD_LENGTH]; + env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); + int codePointCount = wordLength; + if (isBeginningOfSentence) { + codePointCount = 
CharUtils::attachBeginningOfSentenceMarker( + wordCodePoints, wordLength, MAX_WORD_LENGTH); + if (codePointCount < 0) { + AKLOGE("Cannot attach Beginning-of-Sentence marker."); + return; + } + } + const WordProperty wordProperty = dictionary->getWordProperty( + CodePointArrayView(wordCodePoints, codePointCount)); + JniDataUtils::outputWordProperty(env, wordProperty, outCodePoints, outFlags, outProbabilityInfo, + outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray, + outNgramTargets, outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities); +} + +static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz, jlong dict, + jintArray word, jint probability, jintArray shortcutTarget, jint shortcutProbability, + jboolean isBeginningOfSentence, jboolean isNotAWord, jboolean isPossiblyOffensive, + jint timestamp) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + jsize codePointCount = env->GetArrayLength(word); + int codePoints[codePointCount]; + env->GetIntArrayRegion(word, 0, codePointCount, codePoints); + std::vector shortcuts; + { + std::vector shortcutTargetCodePoints; + JniDataUtils::jintarrayToVector(env, shortcutTarget, &shortcutTargetCodePoints); + if (!shortcutTargetCodePoints.empty()) { + shortcuts.emplace_back(std::move(shortcutTargetCodePoints), shortcutProbability); + } + } + // Use 1 for count to indicate the word has inputted. 
+ const UnigramProperty unigramProperty(isBeginningOfSentence, isNotAWord, + isPossiblyOffensive, probability, HistoricalInfo(timestamp, 0 /* level */, + 1 /* count */), std::move(shortcuts)); + return dictionary->addUnigramEntry(CodePointArrayView(codePoints, codePointCount), + &unigramProperty); +} + +static bool latinime_BinaryDictionary_removeUnigramEntry(JNIEnv *env, jclass clazz, jlong dict, + jintArray word) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + jsize codePointCount = env->GetArrayLength(word); + int codePoints[codePointCount]; + env->GetIntArrayRegion(word, 0, codePointCount, codePoints); + return dictionary->removeUnigramEntry(CodePointArrayView(codePoints, codePointCount)); +} + +static bool latinime_BinaryDictionary_addNgramEntry(JNIEnv *env, jclass clazz, jlong dict, + jobjectArray prevWordCodePointArrays, jbooleanArray isBeginningOfSentenceArray, + jintArray word, jint probability, jint timestamp) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordCodePointArrays, isBeginningOfSentenceArray, + env->GetArrayLength(prevWordCodePointArrays)); + jsize wordLength = env->GetArrayLength(word); + int wordCodePoints[wordLength]; + env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); + // Use 1 for count to indicate the ngram has inputted. 
+ const NgramProperty ngramProperty(ngramContext, + CodePointArrayView(wordCodePoints, wordLength).toVector(), + probability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */)); + return dictionary->addNgramEntry(&ngramProperty); +} + +static bool latinime_BinaryDictionary_removeNgramEntry(JNIEnv *env, jclass clazz, jlong dict, + jobjectArray prevWordCodePointArrays, jbooleanArray isBeginningOfSentenceArray, + jintArray word) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordCodePointArrays, isBeginningOfSentenceArray, + env->GetArrayLength(prevWordCodePointArrays)); + jsize codePointCount = env->GetArrayLength(word); + int wordCodePoints[codePointCount]; + env->GetIntArrayRegion(word, 0, codePointCount, wordCodePoints); + return dictionary->removeNgramEntry(&ngramContext, + CodePointArrayView(wordCodePoints, codePointCount)); +} + +static bool latinime_BinaryDictionary_updateEntriesForWordWithNgramContext(JNIEnv *env, + jclass clazz, jlong dict, jobjectArray prevWordCodePointArrays, + jbooleanArray isBeginningOfSentenceArray, jintArray word, jboolean isValidWord, jint count, + jint timestamp) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordCodePointArrays, isBeginningOfSentenceArray, + env->GetArrayLength(prevWordCodePointArrays)); + jsize codePointCount = env->GetArrayLength(word); + int wordCodePoints[codePointCount]; + env->GetIntArrayRegion(word, 0, codePointCount, wordCodePoints); + const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); + return dictionary->updateEntriesForWordWithNgramContext(&ngramContext, + CodePointArrayView(wordCodePoints, codePointCount), isValidWord == JNI_TRUE, + historicalInfo); +} + +// Returns how many input events are processed. 
+static int latinime_BinaryDictionary_updateEntriesForInputEvents(JNIEnv *env, jclass clazz, + jlong dict, jobjectArray inputEvents, jint startIndex) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return 0; + } + jsize inputEventCount = env->GetArrayLength(inputEvents); + if (inputEventCount == 0 || startIndex >= inputEventCount) { + return 0; + } + jobject inputEvent = env->GetObjectArrayElement(inputEvents, 0); + jclass wordInputEventClass = env->GetObjectClass(inputEvent); + env->DeleteLocalRef(inputEvent); + + jfieldID targetWordFieldId = env->GetFieldID(wordInputEventClass, "mTargetWord", "[I"); + jfieldID prevWordCountFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordsCount", "I"); + jfieldID prevWordArrayFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordArray", "[[I"); + jfieldID isPrevWordBoSArrayFieldId = + env->GetFieldID(wordInputEventClass, "mIsPrevWordBeginningOfSentenceArray", "[Z"); + jfieldID isValidFieldId = env->GetFieldID(wordInputEventClass, "mIsValid", "Z"); + jfieldID timestampFieldId = env->GetFieldID(wordInputEventClass, "mTimestamp", "I"); + env->DeleteLocalRef(wordInputEventClass); + + for (int i = startIndex; i < inputEventCount; ++i) { + jobject inputEvent = env->GetObjectArrayElement(inputEvents, i); + jintArray targetWord = static_cast( + env->GetObjectField(inputEvent, targetWordFieldId)); + jsize wordLength = env->GetArrayLength(targetWord); + int wordCodePoints[wordLength]; + env->GetIntArrayRegion(targetWord, 0, wordLength, wordCodePoints); + env->DeleteLocalRef(targetWord); + + jint prevWordCount = env->GetIntField(inputEvent, prevWordCountFieldId); + jobjectArray prevWordArray = + static_cast(env->GetObjectField(inputEvent, prevWordArrayFieldId)); + jbooleanArray isPrevWordBeginningOfSentenceArray = static_cast( + env->GetObjectField(inputEvent, isPrevWordBoSArrayFieldId)); + jboolean isValid = env->GetBooleanField(inputEvent, isValidFieldId); + jint timestamp = 
env->GetIntField(inputEvent, timestampFieldId); + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordArray, isPrevWordBeginningOfSentenceArray, prevWordCount); + // Use 1 for count to indicate the word has inputted. + dictionary->updateEntriesForWordWithNgramContext(&ngramContext, + CodePointArrayView(wordCodePoints, wordLength), isValid, + HistoricalInfo(timestamp, 0 /* level */, 1 /* count */)); + if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) { + return i + 1; + } + env->DeleteLocalRef(prevWordArray); + env->DeleteLocalRef(isPrevWordBeginningOfSentenceArray); + env->DeleteLocalRef(inputEvent); + } + return inputEventCount; +} + +static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict, + jstring query) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return env->NewStringUTF(""); + } + const jsize queryUtf8Length = env->GetStringUTFLength(query); + char queryChars[queryUtf8Length + 1]; + env->GetStringUTFRegion(query, 0, env->GetStringLength(query), queryChars); + queryChars[queryUtf8Length] = '\0'; + static const int GET_PROPERTY_RESULT_LENGTH = 100; + char resultChars[GET_PROPERTY_RESULT_LENGTH]; + resultChars[0] = '\0'; + dictionary->getProperty(queryChars, queryUtf8Length, resultChars, GET_PROPERTY_RESULT_LENGTH); + return env->NewStringUTF(resultChars); +} + +static bool latinime_BinaryDictionary_isCorruptedNative(JNIEnv *env, jclass clazz, jlong dict) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + return dictionary->getDictionaryStructurePolicy()->isCorrupted(); +} + +static DictionaryStructureWithBufferPolicy::StructurePolicyPtr runGCAndGetNewStructurePolicy( + DictionaryStructureWithBufferPolicy::StructurePolicyPtr structurePolicy, + const char *const dictFilePath) { + structurePolicy->flushWithGC(dictFilePath); + structurePolicy.reset(); // destroy the old policy; release() would leak it + return 
DictionaryStructureWithBufferPolicyFactory::newPolicyForExistingDictFile( + dictFilePath, 0 /* offset */, 0 /* size */, true /* isUpdatable */); +} + +static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, jlong dict, + jstring dictFilePath, jlong newFormatVersion) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return false; + } + const jsize filePathUtf8Length = env->GetStringUTFLength(dictFilePath); + char dictFilePathChars[filePathUtf8Length + 1]; + env->GetStringUTFRegion(dictFilePath, 0, env->GetStringLength(dictFilePath), dictFilePathChars); + dictFilePathChars[filePathUtf8Length] = '\0'; + + const DictionaryHeaderStructurePolicy *const headerPolicy = + dictionary->getDictionaryStructurePolicy()->getHeaderStructurePolicy(); + DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy = + DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryDict( + newFormatVersion, *headerPolicy->getLocale(), headerPolicy->getAttributeMap()); + if (!dictionaryStructureWithBufferPolicy) { + LogUtils::logToJava(env, "Cannot migrate header."); + return false; + } + + int wordCodePoints[MAX_WORD_LENGTH]; + int wordCodePointCount = 0; + int token = 0; + // Add unigrams. + do { + token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount); + const WordProperty wordProperty = dictionary->getWordProperty( + CodePointArrayView(wordCodePoints, wordCodePointCount)); + if (wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { + // Skip beginning-of-sentence unigram. 
+ continue; + } + if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) { + dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy( + std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars); + if (!dictionaryStructureWithBufferPolicy) { + LogUtils::logToJava(env, "Cannot open dict after GC."); + return false; + } + } + if (!dictionaryStructureWithBufferPolicy->addUnigramEntry( + CodePointArrayView(wordCodePoints, wordCodePointCount), + &wordProperty.getUnigramProperty())) { + LogUtils::logToJava(env, "Cannot add unigram to the new dict."); + return false; + } + } while (token != 0); + + // Add ngrams. + do { + token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount); + const WordProperty wordProperty = dictionary->getWordProperty( + CodePointArrayView(wordCodePoints, wordCodePointCount)); + if (dictionaryStructureWithBufferPolicy->needsToRunGC(true /* mindsBlockByGC */)) { + dictionaryStructureWithBufferPolicy = runGCAndGetNewStructurePolicy( + std::move(dictionaryStructureWithBufferPolicy), dictFilePathChars); + if (!dictionaryStructureWithBufferPolicy) { + LogUtils::logToJava(env, "Cannot open dict after GC."); + return false; + } + } + for (const NgramProperty &ngramProperty : wordProperty.getNgramProperties()) { + if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&ngramProperty)) { + LogUtils::logToJava(env, "Cannot add ngram to the new dict."); + return false; + } + } + } while (token != 0); + // Save to File. 
+ dictionaryStructureWithBufferPolicy->flushWithGC(dictFilePathChars); + return true; +} + +static const JNINativeMethod sMethods[] = { + { + const_cast("openNative"), + const_cast("(Ljava/lang/String;JJZ)J"), + reinterpret_cast(latinime_BinaryDictionary_open) + }, + { + const_cast("createOnMemoryNative"), + const_cast("(JLjava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)J"), + reinterpret_cast(latinime_BinaryDictionary_createOnMemory) + }, + { + const_cast("closeNative"), + const_cast("(J)V"), + reinterpret_cast(latinime_BinaryDictionary_close) + }, + { + const_cast("getFormatVersionNative"), + const_cast("(J)I"), + reinterpret_cast(latinime_BinaryDictionary_getFormatVersion) + }, + { + const_cast("getHeaderInfoNative"), + const_cast("(J[I[ILjava/util/ArrayList;Ljava/util/ArrayList;)V"), + reinterpret_cast(latinime_BinaryDictionary_getHeaderInfo) + }, + { + const_cast("flushNative"), + const_cast("(JLjava/lang/String;)Z"), + reinterpret_cast(latinime_BinaryDictionary_flush) + }, + { + const_cast("needsToRunGCNative"), + const_cast("(JZ)Z"), + reinterpret_cast(latinime_BinaryDictionary_needsToRunGC) + }, + { + const_cast("flushWithGCNative"), + const_cast("(JLjava/lang/String;)Z"), + reinterpret_cast(latinime_BinaryDictionary_flushWithGC) + }, + { + const_cast("getSuggestionsNative"), + const_cast("(JJJ[I[I[I[I[II[I[[I[ZI[I[I[I[I[I[I[F)V"), + reinterpret_cast(latinime_BinaryDictionary_getSuggestions) + }, + { + const_cast("getProbabilityNative"), + const_cast("(J[I)I"), + reinterpret_cast(latinime_BinaryDictionary_getProbability) + }, + { + const_cast("getMaxProbabilityOfExactMatchesNative"), + const_cast("(J[I)I"), + reinterpret_cast(latinime_BinaryDictionary_getMaxProbabilityOfExactMatches) + }, + { + const_cast("getNgramProbabilityNative"), + const_cast("(J[[I[Z[I)I"), + reinterpret_cast(latinime_BinaryDictionary_getNgramProbability) + }, + { + const_cast("getWordPropertyNative"), + const_cast("(J[IZ[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;" + 
"Ljava/util/ArrayList;Ljava/util/ArrayList;Ljava/util/ArrayList;" + "Ljava/util/ArrayList;)V"), + reinterpret_cast(latinime_BinaryDictionary_getWordProperty) + }, + { + const_cast("getNextWordNative"), + const_cast("(JI[I[Z)I"), + reinterpret_cast(latinime_BinaryDictionary_getNextWord) + }, + { + const_cast("addUnigramEntryNative"), + const_cast("(J[II[IIZZZI)Z"), + reinterpret_cast(latinime_BinaryDictionary_addUnigramEntry) + }, + { + const_cast("removeUnigramEntryNative"), + const_cast("(J[I)Z"), + reinterpret_cast(latinime_BinaryDictionary_removeUnigramEntry) + }, + { + const_cast("addNgramEntryNative"), + const_cast("(J[[I[Z[III)Z"), + reinterpret_cast(latinime_BinaryDictionary_addNgramEntry) + }, + { + const_cast("removeNgramEntryNative"), + const_cast("(J[[I[Z[I)Z"), + reinterpret_cast(latinime_BinaryDictionary_removeNgramEntry) + }, + { + const_cast("updateEntriesForWordWithNgramContextNative"), + const_cast("(J[[I[Z[IZII)Z"), + reinterpret_cast(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext) + }, + { + const_cast("updateEntriesForInputEventsNative"), + const_cast( + "(J[Lbe/scri/inputmethod/latin/utils/WordInputEventForPersonalization;I)I"), + reinterpret_cast(latinime_BinaryDictionary_updateEntriesForInputEvents) + }, + { + const_cast("getPropertyNative"), + const_cast("(JLjava/lang/String;)Ljava/lang/String;"), + reinterpret_cast(latinime_BinaryDictionary_getProperty) + }, + { + const_cast("isCorruptedNative"), + const_cast("(J)Z"), + reinterpret_cast(latinime_BinaryDictionary_isCorruptedNative) + }, + { + const_cast("migrateNative"), + const_cast("(JLjava/lang/String;J)Z"), + reinterpret_cast(latinime_BinaryDictionary_migrateNative) + } +}; + +int register_BinaryDictionary(JNIEnv *env) { + const char *const kClassPathName = "be/scri/inputmethod/latin/BinaryDictionary"; + return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods)); +} +} // namespace latinime diff --git 
a/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionary.h b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionary.h new file mode 100644 index 000000000..2a07f9936 --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionary.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _COM_ANDROID_INPUTMETHOD_LATIN_BINARYDICTIONARY_H +#define _COM_ANDROID_INPUTMETHOD_LATIN_BINARYDICTIONARY_H + +#include "jni.h" + +namespace latinime { +int register_BinaryDictionary(JNIEnv *env); +} // namespace latinime +#endif // _COM_ANDROID_INPUTMETHOD_LATIN_BINARYDICTIONARY_H diff --git a/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp new file mode 100644 index 000000000..5a6af2190 --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: jni: BinaryDictionaryUtils" + +#include "com_android_inputmethod_latin_BinaryDictionaryUtils.h" + +#include "defines.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "jni.h" +#include "jni_common.h" +#include "utils/autocorrection_threshold_utils.h" +#include "utils/char_utils.h" +#include "utils/jni_data_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +static jboolean latinime_BinaryDictionaryUtils_createEmptyDictFile(JNIEnv *env, jclass clazz, + jstring filePath, jlong dictVersion, jstring locale, jobjectArray attributeKeyStringArray, + jobjectArray attributeValueStringArray) { + const jsize filePathUtf8Length = env->GetStringUTFLength(filePath); + char filePathChars[filePathUtf8Length + 1]; + env->GetStringUTFRegion(filePath, 0, env->GetStringLength(filePath), filePathChars); + filePathChars[filePathUtf8Length] = '\0'; + + const jsize localeUtf8Length = env->GetStringUTFLength(locale); + char localeChars[localeUtf8Length + 1]; + env->GetStringUTFRegion(locale, 0, env->GetStringLength(locale), localeChars); + localeChars[localeUtf8Length] = '\0'; + std::vector localeCodePoints; + HeaderReadWriteUtils::insertCharactersIntoVector(localeChars, &localeCodePoints); + + const int keyCount = env->GetArrayLength(attributeKeyStringArray); + const int valueCount = env->GetArrayLength(attributeValueStringArray); + if (keyCount != valueCount) { + return false; + } + DictionaryHeaderStructurePolicy::AttributeMap attributeMap = + JniDataUtils::constructAttributeMap(env, 
attributeKeyStringArray, + attributeValueStringArray); + return DictFileWritingUtils::createEmptyDictFile(filePathChars, static_cast(dictVersion), + localeCodePoints, &attributeMap); +} + +static jfloat latinime_BinaryDictionaryUtils_calcNormalizedScore(JNIEnv *env, jclass clazz, + jintArray before, jintArray after, jint score) { + jsize beforeLength = env->GetArrayLength(before); + jsize afterLength = env->GetArrayLength(after); + int beforeCodePoints[beforeLength]; + int afterCodePoints[afterLength]; + env->GetIntArrayRegion(before, 0, beforeLength, beforeCodePoints); + env->GetIntArrayRegion(after, 0, afterLength, afterCodePoints); + return AutocorrectionThresholdUtils::calcNormalizedScore(beforeCodePoints, beforeLength, + afterCodePoints, afterLength, score); +} + +static int latinime_BinaryDictionaryUtils_setCurrentTimeForTest(JNIEnv *env, jclass clazz, + jint currentTime) { + if (currentTime >= 0) { + TimeKeeper::startTestModeWithForceCurrentTime(currentTime); + } else { + TimeKeeper::stopTestMode(); + } + TimeKeeper::setCurrentTime(); + return TimeKeeper::peekCurrentTime(); +} + +static const JNINativeMethod sMethods[] = { + { + const_cast("createEmptyDictFileNative"), + const_cast( + "(Ljava/lang/String;JLjava/lang/String;[Ljava/lang/String;[Ljava/lang/String;)Z"), + reinterpret_cast(latinime_BinaryDictionaryUtils_createEmptyDictFile) + }, + { + const_cast("calcNormalizedScoreNative"), + const_cast("([I[II)F"), + reinterpret_cast(latinime_BinaryDictionaryUtils_calcNormalizedScore) + }, + { + const_cast("setCurrentTimeForTestNative"), + const_cast("(I)I"), + reinterpret_cast(latinime_BinaryDictionaryUtils_setCurrentTimeForTest) + } +}; + +int register_BinaryDictionaryUtils(JNIEnv *env) { + const char *const kClassPathName = "be/scri/inputmethod/latin/utils/BinaryDictionaryUtils"; + return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods)); +} +} // namespace latinime diff --git 
a/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.h b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.h new file mode 100644 index 000000000..38edcd20c --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_latin_BinaryDictionaryUtils.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _COM_ANDROID_INPUTMETHOD_LATIN_BINARYDICTIONARYUTILS_H +#define _COM_ANDROID_INPUTMETHOD_LATIN_BINARYDICTIONARYUTILS_H + +#include "jni.h" + +namespace latinime { +int register_BinaryDictionaryUtils(JNIEnv *env); +} // namespace latinime +#endif // _COM_ANDROID_INPUTMETHOD_LATIN_BINARYDICTIONARYUTILS_H diff --git a/app/src/main/jni/com_android_inputmethod_latin_DicTraverseSession.cpp b/app/src/main/jni/com_android_inputmethod_latin_DicTraverseSession.cpp new file mode 100644 index 000000000..caa800879 --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_latin_DicTraverseSession.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: jni: Session" + +#include "com_android_inputmethod_latin_DicTraverseSession.h" + +#include "defines.h" +#include "dictionary/property/ngram_context.h" +#include "jni.h" +#include "jni_common.h" +#include "suggest/core/session/dic_traverse_session.h" + +namespace latinime { +class Dictionary; +static jlong latinime_setDicTraverseSession(JNIEnv *env, jclass clazz, jstring localeJStr, + jlong dictSize) { + void *traverseSession = DicTraverseSession::getSessionInstance(env, localeJStr, dictSize); + return reinterpret_cast(traverseSession); +} + +static void latinime_initDicTraverseSession(JNIEnv *env, jclass clazz, jlong traverseSession, + jlong dictionary, jintArray previousWord, jint previousWordLength) { + DicTraverseSession *ts = reinterpret_cast(traverseSession); + if (!ts) { + return; + } + Dictionary *dict = reinterpret_cast(dictionary); + if (!previousWord) { + NgramContext emptyNgramContext; + ts->init(dict, &emptyNgramContext, 0 /* suggestOptions */); + return; + } + int prevWord[previousWordLength]; + env->GetIntArrayRegion(previousWord, 0, previousWordLength, prevWord); + NgramContext ngramContext(prevWord, previousWordLength, false /* isStartOfSentence */); + ts->init(dict, &ngramContext, 0 /* suggestOptions */); +} + +static void latinime_releaseDicTraverseSession(JNIEnv *env, jclass clazz, jlong traverseSession) { + DicTraverseSession *ts = reinterpret_cast(traverseSession); + DicTraverseSession::releaseSessionInstance(ts); +} + +static const JNINativeMethod sMethods[] = { + { + 
const_cast("setDicTraverseSessionNative"), + const_cast("(Ljava/lang/String;J)J"), + reinterpret_cast(latinime_setDicTraverseSession) + }, + { + const_cast("initDicTraverseSessionNative"), + const_cast("(JJ[II)V"), + reinterpret_cast(latinime_initDicTraverseSession) + }, + { + const_cast("releaseDicTraverseSessionNative"), + const_cast("(J)V"), + reinterpret_cast(latinime_releaseDicTraverseSession) + } +}; + +int register_DicTraverseSession(JNIEnv *env) { + const char *const kClassPathName = "be/scri/inputmethod/latin/DicTraverseSession"; + return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods)); +} +} // namespace latinime diff --git a/app/src/main/jni/com_android_inputmethod_latin_DicTraverseSession.h b/app/src/main/jni/com_android_inputmethod_latin_DicTraverseSession.h new file mode 100644 index 000000000..badcbb9ea --- /dev/null +++ b/app/src/main/jni/com_android_inputmethod_latin_DicTraverseSession.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _COM_ANDROID_INPUTMETHOD_LATIN_DICTRAVERSESESSION_H +#define _COM_ANDROID_INPUTMETHOD_LATIN_DICTRAVERSESESSION_H + +#include "jni.h" + +namespace latinime { +int register_DicTraverseSession(JNIEnv *env); +} // namespace latinime +#endif // _COM_ANDROID_INPUTMETHOD_LATIN_DICTRAVERSESESSION_H diff --git a/app/src/main/jni/jni_common.cpp b/app/src/main/jni/jni_common.cpp new file mode 100644 index 000000000..ce5e30c5d --- /dev/null +++ b/app/src/main/jni/jni_common.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: jni" + +#include "jni_common.h" + +#include "com_android_inputmethod_keyboard_ProximityInfo.h" +#include "com_android_inputmethod_latin_BinaryDictionary.h" +#include "com_android_inputmethod_latin_BinaryDictionaryUtils.h" +#include "com_android_inputmethod_latin_DicTraverseSession.h" +#include "defines.h" + +/* + * Returns the JNI version on success, -1 on failure. 
+ */ +jint JNI_OnLoad(JavaVM *vm, void *reserved) { + JNIEnv *env = 0; + + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { + AKLOGE("ERROR: GetEnv failed"); + return -1; + } + ASSERT(env); + if (!env) { + AKLOGE("ERROR: JNIEnv is invalid"); + return -1; + } + if (!latinime::register_BinaryDictionary(env)) { + AKLOGE("ERROR: BinaryDictionary native registration failed"); + return -1; + } + if (!latinime::register_BinaryDictionaryUtils(env)) { + AKLOGE("ERROR: BinaryDictionaryUtils native registration failed"); + return -1; + } + if (!latinime::register_DicTraverseSession(env)) { + AKLOGE("ERROR: DicTraverseSession native registration failed"); + return -1; + } + if (!latinime::register_ProximityInfo(env)) { + AKLOGE("ERROR: ProximityInfo native registration failed"); + return -1; + } + /* success -- return valid version number */ + return JNI_VERSION_1_6; +} + +namespace latinime { +int registerNativeMethods(JNIEnv *env, const char *const className, const JNINativeMethod *methods, + const int numMethods) { + jclass clazz = env->FindClass(className); + if (!clazz) { + AKLOGE("Native registration unable to find class '%s'", className); + return JNI_FALSE; + } + if (env->RegisterNatives(clazz, methods, numMethods) != 0) { + AKLOGE("RegisterNatives failed for '%s'", className); + env->DeleteLocalRef(clazz); + return JNI_FALSE; + } + env->DeleteLocalRef(clazz); + return JNI_TRUE; +} +} // namespace latinime diff --git a/app/src/main/jni/jni_common.h b/app/src/main/jni/jni_common.h new file mode 100644 index 000000000..ef72a7ce9 --- /dev/null +++ b/app/src/main/jni/jni_common.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_JNI_COMMON_H +#define LATINIME_JNI_COMMON_H + +#include "jni.h" + +namespace latinime { +int registerNativeMethods(JNIEnv *env, const char *const className, const JNINativeMethod *methods, + const int numMethods); +} // namespace latinime +#endif // LATINIME_JNI_COMMON_H diff --git a/app/src/main/jni/run-tests.sh b/app/src/main/jni/run-tests.sh new file mode 100755 index 000000000..a7fa82d9b --- /dev/null +++ b/app/src/main/jni/run-tests.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright 2014, The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+function usage() {
+    echo "usage: source run-tests.sh [--host|--no-host] [--target|--no-target] [-h] [--help]" 1>&2
+    echo " --host: run test on the host environment" 1>&2
+    echo " --no-host: skip host test" 1>&2
+    echo " --target: run test on the target environment" 1>&2
+    echo " --no-target: skip target device test" 1>&2
+}
+
+# Requires the 'mmm' shell function; otherwise print usage and stop (return if sourced, else exit).
+if [[ $(type -t mmm) != function ]]; then
+usage
+if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then return; else exit 1; fi
+fi
+
+show_usage=no
+enable_host_test=yes
+enable_target_device_test=no
+while [[ $# -gt 0 ]]  # loop on the argument count so an empty argument does not end parsing early
+  do
+  case "$1" in
+    "-h") show_usage=yes;;
+    "--help") show_usage=yes;;
+    "--target") enable_target_device_test=yes;;
+    "--no-target") enable_target_device_test=no;;
+    "--host") enable_host_test=yes;;
+    "--no-host") enable_host_test=no;;
+  esac
+  shift
+done
+
+if [[ $show_usage == yes ]]; then
+  usage
+  if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then return; else exit 1; fi
+fi
+
+# Host build is never supported in unbundled (NDK/tapas) build
+if [[ $enable_host_test == yes && -n $TARGET_BUILD_APPS ]]; then
+  echo "Host build is never supported in tapas build." 1>&2
+  echo "Use lunch command instead." 1>&2
+  if [[ "${BASH_SOURCE[0]}" != "$0" ]]; then return; else exit 1; fi
+fi
+
+target_test_name=liblatinime_target_unittests
+host_test_name=liblatinime_host_unittests
+
+pushd "$PWD" > /dev/null
+cd "$(gettop)"
+mmm -j16 packages/inputmethods/LatinIME/native/jni || \
+  make -j16 adb "$target_test_name" "$host_test_name"
+if [[ $enable_host_test == yes ]]; then
+  "$ANDROID_HOST_OUT/bin/$host_test_name"
+fi
+if [[ $enable_target_device_test == yes ]]; then
+  target_test_local="$ANDROID_PRODUCT_OUT/data/nativetest/$target_test_name/$target_test_name"
+  target_test_device="/data/nativetest/$target_test_name/$target_test_name"
+  adb push "$target_test_local" "$target_test_device"
+  adb shell "$target_test_device"
+  adb shell rm -rf "$target_test_device"
+fi
+popd > /dev/null
diff --git a/app/src/main/jni/src/defines.h b/app/src/main/jni/src/defines.h
new file mode 100644
index 000000000..1531b6cbe
--- /dev/null
+++ b/app/src/main/jni/src/defines.h
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DEFINES_H
+#define LATINIME_DEFINES_H
+
+#include <cstdint>
+
+#ifdef __GNUC__
+#define AK_FORCE_INLINE __attribute__((always_inline)) __inline__
+#else // __GNUC__
+#define AK_FORCE_INLINE inline
+#endif // __GNUC__
+
+#if defined(FLAG_DBG)
+#undef AK_FORCE_INLINE
+#define AK_FORCE_INLINE inline
+#endif // defined(FLAG_DBG)
+
+// Must be equal to Constants.Dictionary.MAX_WORD_LENGTH in Java
+#define MAX_WORD_LENGTH 48
+// Must be equal to BinaryDictionary.MAX_RESULTS in Java
+#define MAX_RESULTS 18
+// Must be equal to ProximityInfo.MAX_PROXIMITY_CHARS_SIZE in Java
+#define MAX_PROXIMITY_CHARS_SIZE 16
+#define ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE 2
+
+// TODO: Use size_t instead of int.
+// Disclaimer: You will see a compile error if you use this macro against a variable-length array.
+// Sorry for the inconvenience. It isn't supported.
+template <typename T, int N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define NELEMS(x) (sizeof(ArraySizeHelper(x)))
+
+AK_FORCE_INLINE static int intArrayToCharArray(const int *const source, const int sourceSize,
+        char *dest, const int destSize) {
+    // Encodes code points as UTF-8 and returns the byte count written, excluding the terminator.
+    // dest is always 0-terminated, so destSize must be >= 1; encoding stops one short of it.
+    const int destLimit = destSize - 1;
+    int si = 0;
+    int di = 0;
+    while (si < sourceSize && di < destLimit && 0 != source[si]) {
+        const uint32_t codePoint = static_cast<uint32_t>(source[si++]);
+        if (codePoint <= 0x7F) { // One byte (upper bound is inclusive: U+007F is ASCII)
+            dest[di++] = codePoint;
+        } else if (codePoint <= 0x7FF) { // Two bytes
+            if (di + 1 >= destLimit) break;
+            dest[di++] = 0xC0 + (codePoint >> 6);
+            dest[di++] = 0x80 + (codePoint & 0x3F);
+        } else if (codePoint <= 0xFFFF) { // Three bytes
+            if (di + 2 >= destLimit) break;
+            dest[di++] = 0xE0 + (codePoint >> 12);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = 0x80 + (codePoint & 0x3F);
+        } else if (codePoint <= 0x1FFFFF) { // Four bytes
+            if (di + 3 >= destLimit) break;
+            dest[di++] = 0xF0 + (codePoint >> 18);
+            dest[di++] = 0x80 + ((codePoint >> 12) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = 0x80 + (codePoint & 0x3F);
+        } else if (codePoint <= 0x3FFFFFF) { // Five bytes
+            if (di + 4 >= destLimit) break;
+            dest[di++] = 0xF8 + (codePoint >> 24);
+            dest[di++] = 0x80 + ((codePoint >> 18) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 12) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = 0x80 + (codePoint & 0x3F); // continuation bytes always carry 0x80
+        } else if (codePoint <= 0x7FFFFFFF) { // Six bytes
+            if (di + 5 >= destLimit) break;
+            dest[di++] = 0xFC + (codePoint >> 30);
+            dest[di++] = 0x80 + ((codePoint >> 24) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 18) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 12) & 0x3F);
+            dest[di++] = 0x80 + ((codePoint >> 6) & 0x3F);
+            dest[di++] = 0x80 + (codePoint & 0x3F); // continuation bytes always carry 0x80
+        } else {
+            // Not a code point... skip.
+        }
+    }
+    dest[di] = 0;
+    return di;
+}
+
+#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
+#if defined(__ANDROID__)
+#include <android/log.h>
+#endif // defined(__ANDROID__)
+#ifndef LOG_TAG
+#define LOG_TAG "LatinIME: "
+#endif // LOG_TAG
+
+#if defined(HOST_TOOL)
+#include <stdio.h>
+#define AKLOGE(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
+#define AKLOGI(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
+#else // defined(HOST_TOOL)
+#define AKLOGE(fmt, ...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, fmt, ##__VA_ARGS__)
+#define AKLOGI(fmt, ...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, fmt, ##__VA_ARGS__)
+#endif // defined(HOST_TOOL)
+
+#define DUMP_SUGGESTION(words, frequencies, index, score) \
+        do { dumpWordInfo(words, frequencies, index, score); } while (0)
+#define DUMP_WORD(word, length) do { dumpWord(word, length); } while (0)
+#define INTS_TO_CHARS(input, length, output, outlength) do { \
+        intArrayToCharArray(input, length, output, outlength); } while (0)
+
+static inline void dumpWordInfo(const int *word, const int length, const int rank,
+        const int probability) {
+    static char charBuf[50];  // static scratch buffer shared by every call
+    const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
+    if (N > 0) {
+        AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability);
+    }
+}
+
+static AK_FORCE_INLINE void dumpWord(const int *word, const int length) {
+    static char charBuf[50];  // static scratch buffer shared by every call
+    const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
+    if (N > 1) {
+        AKLOGI("[ %s ]", charBuf);
+    }
+}
+
+#ifndef __ANDROID__
+#include <cassert>
+#include <execinfo.h>
+#include <cstdlib>
+
+#define DO_ASSERT_TEST
+#define ASSERT(success) do { if (!(success)) { showStackTrace(); assert(success);} } while (0)
+#define SHOW_STACK_TRACE do { showStackTrace(); } while (0)
+
+static inline void showStackTrace() {
+    void *callstack[128];
+    int i, frames = backtrace(callstack, 128);
+    char **strs = backtrace_symbols(callstack, frames);  // NULL when symbolization fails
+    for (i = 0; strs && i < frames; ++i) {
+        if (i == 0) {
+            AKLOGI("=== Trace ===");
+            continue;
+        }
+        AKLOGI("%s", strs[i]);
+    }
+    free(strs);
+}
+#else // __ANDROID__
+#include <cassert>
+#define DO_ASSERT_TEST
+#define ASSERT(success) assert(success)
+#define SHOW_STACK_TRACE
+#endif // __ANDROID__
+
+#else // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
+#define AKLOGE(fmt, ...)
+#define AKLOGI(fmt, ...)
+#define DUMP_SUGGESTION(words, frequencies, index, score)
+#define DUMP_WORD(word, length)
+#undef DO_ASSERT_TEST
+#define ASSERT(success)
+#define SHOW_STACK_TRACE
+#define INTS_TO_CHARS(input, length, output, outlength) // same arity as the debug definition
+#endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
+
+#ifdef FLAG_DBG
+#define DEBUG_DICT true
+#define DEBUG_DICT_FULL false
+#define DEBUG_EDIT_DISTANCE false
+#define DEBUG_NODE DEBUG_DICT_FULL
+#define DEBUG_TRACE DEBUG_DICT_FULL
+#define DEBUG_PROXIMITY_INFO false
+#define DEBUG_PROXIMITY_CHARS false
+#define DEBUG_CORRECTION false
+#define DEBUG_CORRECTION_FREQ false
+#define DEBUG_SAMPLING_POINTS false
+#define DEBUG_POINTS_PROBABILITY false
+#define DEBUG_DOUBLE_LETTER false
+#define DEBUG_CACHE false
+#define DEBUG_DUMP_ERROR false
+#define DEBUG_EVALUATE_MOST_PROBABLE_STRING false
+
+#ifdef FLAG_FULL_DBG
+#define DEBUG_GEO_FULL true
+#else
+#define DEBUG_GEO_FULL false
+#endif
+
+#else // FLAG_DBG
+
+#define DEBUG_DICT false
+#define DEBUG_DICT_FULL false
+#define DEBUG_EDIT_DISTANCE false
+#define DEBUG_NODE false
+#define DEBUG_TRACE false
+#define DEBUG_PROXIMITY_INFO false
+#define DEBUG_PROXIMITY_CHARS false
+#define DEBUG_CORRECTION false
+#define DEBUG_CORRECTION_FREQ false
+#define DEBUG_SAMPLING_POINTS false
+#define DEBUG_POINTS_PROBABILITY false
+#define DEBUG_DOUBLE_LETTER false
+#define DEBUG_CACHE false
+#define DEBUG_DUMP_ERROR false
+#define DEBUG_EVALUATE_MOST_PROBABLE_STRING false
+
+#define DEBUG_GEO_FULL false
+
+#endif // FLAG_DBG
+
+#ifndef S_INT_MAX
+#define S_INT_MAX 2147483647 // ((1 << 31) - 1)
+#endif
+#ifndef S_INT_MIN
+// The literal constant -2147483648 does not work in C prior to C90, because
+// the compiler tries to fit the positive number into an int and then negate it.
+// GCC warns about this.
+#define S_INT_MIN (-2147483647 - 1) // -(1 << 31) +#endif + +#define M_PI_F 3.14159265f +#define MAX_PERCENTILE 100 + +#define NOT_A_CODE_POINT (-1) +#define NOT_A_DISTANCE (-1) +#define NOT_A_COORDINATE (-1) +#define NOT_AN_INDEX (-1) +#define NOT_A_PROBABILITY (-1) +#define NOT_A_DICT_POS (S_INT_MIN) +#define NOT_A_WORD_ID (S_INT_MIN) +#define NOT_A_TIMESTAMP (-1) +#define NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1.0f) + +// A special value to mean the first word confidence makes no sense in this case, +// e.g. this is not a multi-word suggestion. +#define NOT_A_FIRST_WORD_CONFIDENCE (S_INT_MIN) +// How high the confidence needs to be for us to auto-commit. Arbitrary. +// This needs to be the same as CONFIDENCE_FOR_AUTO_COMMIT in BinaryDictionary.java +#define CONFIDENCE_FOR_AUTO_COMMIT (1000000) +// 80% of the full confidence +#define DISTANCE_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100) +// 100% of the full confidence +#define LENGTH_WEIGHT_FOR_AUTO_COMMIT (CONFIDENCE_FOR_AUTO_COMMIT) +// 80% of the full confidence +#define SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT (80 * CONFIDENCE_FOR_AUTO_COMMIT / 100) + +#define KEYCODE_SPACE ' ' +#define KEYCODE_SINGLE_QUOTE '\'' +#define KEYCODE_HYPHEN_MINUS '-' +// Code point to indicate beginning-of-sentence. This is not in the code point space of unicode. +#define CODE_POINT_BEGINNING_OF_SENTENCE 0x110000 + +#define SUGGEST_INTERFACE_OUTPUT_SCALE 1000000.0f +#define MAX_PROBABILITY 255 +#define MAX_BIGRAM_ENCODED_PROBABILITY 15 + +// Max value for length, distance and probability which are used in weighting +// TODO: Remove +#define MAX_VALUE_FOR_WEIGHTING 10000000 + +// The max number of the keys in one keyboard layout +#define MAX_KEY_COUNT_IN_A_KEYBOARD 64 + +// TODO: Remove +#define MAX_POINTER_COUNT 1 +#define MAX_POINTER_COUNT_G 2 + +// (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported. 
+#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 3 + +#define DISALLOW_DEFAULT_CONSTRUCTOR(TypeName) \ + TypeName() = delete + +#define DISALLOW_COPY_CONSTRUCTOR(TypeName) \ + TypeName(const TypeName&) = delete + +#define DISALLOW_ASSIGNMENT_OPERATOR(TypeName) \ + void operator=(const TypeName&) = delete + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + DISALLOW_COPY_CONSTRUCTOR(TypeName); \ + DISALLOW_ASSIGNMENT_OPERATOR(TypeName) + +#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ + DISALLOW_DEFAULT_CONSTRUCTOR(TypeName); \ + DISALLOW_COPY_AND_ASSIGN(TypeName) + +// Used as a return value for character comparison +typedef enum { + // Same char, possibly with different case or accent + MATCH_CHAR, + // It is a char located nearby on the keyboard + PROXIMITY_CHAR, + // Additional proximity char which can differ by language. + ADDITIONAL_PROXIMITY_CHAR, + // It is a substitution char + SUBSTITUTION_CHAR, + // It is an unrelated char + UNRELATED_CHAR, +} ProximityType; + +typedef enum { + NOT_A_DOUBLE_LETTER, + A_DOUBLE_LETTER, + A_STRONG_DOUBLE_LETTER +} DoubleLetterLevel; + +typedef enum { + // Correction for MATCH_CHAR + CT_MATCH, + // Correction for PROXIMITY_CHAR + CT_PROXIMITY, + // Correction for ADDITIONAL_PROXIMITY_CHAR + CT_ADDITIONAL_PROXIMITY, + // Correction for SUBSTITUTION_CHAR + CT_SUBSTITUTION, + // Skip one omitted letter + CT_OMISSION, + // Delete an unnecessarily inserted letter + CT_INSERTION, + // Swap the order of next two touch points + CT_TRANSPOSITION, + CT_COMPLETION, + CT_TERMINAL, + CT_TERMINAL_INSERTION, + // Create new word with space omission + CT_NEW_WORD_SPACE_OMISSION, + // Create new word with space substitution + CT_NEW_WORD_SPACE_SUBSTITUTION, +} CorrectionType; +#endif // LATINIME_DEFINES_H diff --git a/app/src/main/jni/src/dictionary/header/header_policy.cpp b/app/src/main/jni/src/dictionary/header/header_policy.cpp new file mode 100644 index 000000000..d4f84d39f --- /dev/null +++ 
b/app/src/main/jni/src/dictionary/header/header_policy.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/header/header_policy.h" + +#include + +#include "utils/ngram_utils.h" + +namespace latinime { + +// Note that these are corresponding definitions in Java side in DictionaryHeader. +const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; +const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = + "REQUIRES_GERMAN_UMLAUT_PROCESSING"; +// TODO: Change attribute string to "IS_DECAYING_DICT". +const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; +const char *const HeaderPolicy::DATE_KEY = "date"; +const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; +const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = + {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; +const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = + {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", + "MAX_QUADGRAM_ENTRY_COUNT"}; +const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; +const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; +// Historical info is information that is needed to support decaying such as timestamp, level and +// count. 
+const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
+const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
+const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
+        "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
+
+const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
+const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
+const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
+
+// Used for logging. Writes the value as 0-terminated code points; "?" means the key is not found.
+void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+        int outValueSize) const {
+    if (outValueSize <= 0) return;
+    if (outValueSize == 1) {
+        outValue[0] = '\0';
+        return;
+    }
+    std::vector<int> keyCodePointVector;
+    HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
+    DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it =
+            mAttributeMap.find(keyCodePointVector);
+    if (it == mAttributeMap.end()) {
+        // The key was not found.
+        outValue[0] = '?';
+        outValue[1] = '\0';
+        return;
+    }
+    const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1);
+    for (int i = 0; i < terminalIndex; ++i) {
+        outValue[i] = it->second[i];
+    }
+    outValue[terminalIndex] = '\0';
+}
+
+const std::vector<int> HeaderPolicy::readLocale() const {
+    return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY);
+}
+
+float HeaderPolicy::readMultipleWordCostMultiplier() const {
+    const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+            MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
+    if (demotionRate <= 0) {  // also guards the division below against zero
+        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
+    return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
+}
+
+bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
+    return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
+            REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false);
+}
+
+bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
+        const EntryCounts &entryCounts, const int extendedRegionSize,
+        BufferWithExtendableBuffer *const outBuffer) const {
+    int writingPos = 0;
+    DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
+    fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
+    if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
+            &writingPos)) {
+        return false;
+    }
+    if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags,
+            &writingPos)) {
+        return false;
+    }
+    // Temporarily writes a dummy header size.
+    int headerSizeFieldPos = writingPos;
+    if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */,
+            &writingPos)) {
+        return false;
+    }
+    if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite,
+            &writingPos)) {
+        return false;
+    }
+    // Writes the actual header size (the final writing position) over the dummy value.
+    if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos,
+            &headerSizeFieldPos)) {
+        return false;
+    }
+    return true;
+}
+
+namespace {
+
+int getIndexFromNgramType(const NgramType ngramType) {
+    return static_cast<int>(ngramType);
+}
+
+} // namespace
+
+void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
+        const EntryCounts &entryCounts, const int extendedRegionSize,
+        DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
+    for (const auto ngramType : AllNgramTypes::ASCENDING) {
+        HeaderReadWriteUtils::setIntAttribute(outAttributeMap,
+                NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)],
+                entryCounts.getNgramCount(ngramType));
+    }
+    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
+            extendedRegionSize);
+    // Set the current time as the generation time.
+    HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY,
+            TimeKeeper::peekCurrentTime());
+    HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale);
+    if (updatesLastDecayedTime) {
+        // Set current time as the last decayed time.
+        HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY,
+                TimeKeeper::peekCurrentTime());
+    }
+}
+
+/* static */ DictionaryHeaderStructurePolicy::AttributeMap
+        HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
+    DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
+    HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
+    return attributeMap;
+}
+
+const EntryCounts HeaderPolicy::readNgramCounts() const {
+    MutableEntryCounters entryCounters;
+    for (const auto ngramType : AllNgramTypes::ASCENDING) {
+        const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */);
+        entryCounters.setNgramCount(ngramType, entryCount);
+    }
+    return entryCounters.getEntryCounts();
+}
+
+const EntryCounts HeaderPolicy::readMaxNgramCounts() const {
+    MutableEntryCounters entryCounters;
+    for (const auto ngramType : AllNgramTypes::ASCENDING) {
+        const int index = getIndexFromNgramType(ngramType);
+        const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+                MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]);
+        entryCounters.setNgramCount(ngramType, maxEntryCount);
+    }
+    return entryCounters.getEntryCounts();
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/header/header_policy.h b/app/src/main/jni/src/dictionary/header/header_policy.h
new file mode 100644
index 000000000..47cc9196a
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/header/header_policy.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_POLICY_H +#define LATINIME_HEADER_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/format_utils.h" +#include "utils/char_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class HeaderPolicy : public DictionaryHeaderStructurePolicy { + public: + // Reads information from existing dictionary buffer. + HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) + : mDictFormatVersion(formatVersion), + mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), + mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), + mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), + mLocale(readLocale()), + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), + mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + IS_DECAYING_DICT_KEY, false /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + 
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), + mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), + mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, + DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} + + // Constructs header information using an attribute map. + HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, + const std::vector &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) + : mDictFormatVersion(dictFormatVersion), + mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale), + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), + mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + IS_DECAYING_DICT_KEY, false /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + mExtendedRegionSize(0), + mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( + &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), + mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, + 
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} + + // Copy header information + HeaderPolicy(const HeaderPolicy *const headerPolicy) + : mDictFormatVersion(headerPolicy->mDictFormatVersion), + mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize), + mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale), + mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), + mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), + mIsDecayingDict(headerPolicy->mIsDecayingDict), + mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), + mNgramCounts(headerPolicy->mNgramCounts), + mMaxNgramCounts(headerPolicy->mMaxNgramCounts), + mExtendedRegionSize(headerPolicy->mExtendedRegionSize), + mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), + mForgettingCurveProbabilityValuesTableId( + headerPolicy->mForgettingCurveProbabilityValuesTableId), + mCodePointTable(headerPolicy->mCodePointTable) {} + + // Temporary dummy header. + HeaderPolicy() + : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), + mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), + mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), + mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), + mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} + + ~HeaderPolicy() {} + + virtual int getFormatVersionNumber() const { + // Conceptually this converts the symbolic value we use in the code into the + // hardcoded of the bytes in the file. But we want the constants to be the + // same so we use them for both here. 
+ switch (mDictFormatVersion) { + case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return FormatUtils::UNKNOWN_VERSION; + case FormatUtils::VERSION_202: + return FormatUtils::VERSION_202; + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + return FormatUtils::VERSION_4_ONLY_FOR_TESTING; + case FormatUtils::VERSION_402: + return FormatUtils::VERSION_402; + case FormatUtils::VERSION_403: + return FormatUtils::VERSION_403; + default: + return FormatUtils::UNKNOWN_VERSION; + } + } + + AK_FORCE_INLINE bool isValid() const { + // Decaying dictionary must have historical information. + if (!mIsDecayingDict) { + return true; + } + if (mHasHistoricalInfoOfWords) { + return true; + } else { + return false; + } + } + + AK_FORCE_INLINE int getSize() const { + return mSize; + } + + AK_FORCE_INLINE float getMultiWordCostMultiplier() const { + return mMultiWordCostMultiplier; + } + + AK_FORCE_INLINE bool isDecayingDict() const { + return mIsDecayingDict; + } + + AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { + return mRequiresGermanUmlautProcessing; + } + + AK_FORCE_INLINE int getDate() const { + return mDate; + } + + AK_FORCE_INLINE int getLastDecayedTime() const { + return mLastDecayedTime; + } + + AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { + return mNgramCounts; + } + + AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { + return mMaxNgramCounts; + } + + AK_FORCE_INLINE int getExtendedRegionSize() const { + return mExtendedRegionSize; + } + + AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { + return mHasHistoricalInfoOfWords; + } + + AK_FORCE_INLINE bool shouldBoostExactMatches() const { + // TODO: Investigate better ways to handle exact matches for personalized dictionaries. 
+ return !isDecayingDict(); + } + + const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { + return &mAttributeMap; + } + + AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { + return mForgettingCurveProbabilityValuesTableId; + } + + void readHeaderValueOrQuestionMark(const char *const key, + int *outValue, int outValueSize) const; + + bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const; + + void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, + const int extendedRegionSize, + DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; + + AK_FORCE_INLINE const std::vector *getLocale() const { + return &mLocale; + } + + bool supportsBeginningOfSentence() const { + return mDictFormatVersion >= FormatUtils::VERSION_402; + } + + const int *getCodePointTable() const { + return mCodePointTable; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); + + static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; + static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; + static const char *const IS_DECAYING_DICT_KEY; + static const char *const DATE_KEY; + static const char *const LAST_DECAYED_TIME_KEY; + static const char *const NGRAM_COUNT_KEYS[]; + static const char *const MAX_NGRAM_COUNT_KEYS[]; + static const int DEFAULT_MAX_NGRAM_COUNTS[]; + static const char *const EXTENDED_REGION_SIZE_KEY; + static const char *const HAS_HISTORICAL_INFO_KEY; + static const char *const LOCALE_KEY; + static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; + static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; + static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; + static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; + static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; + static const 
int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
+
+    const FormatUtils::FORMAT_VERSION mDictFormatVersion;
+    const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
+    const int mSize;
+    DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
+    const std::vector mLocale;
+    const float mMultiWordCostMultiplier;
+    const bool mRequiresGermanUmlautProcessing;
+    const bool mIsDecayingDict;
+    const int mDate;
+    const int mLastDecayedTime;
+    const EntryCounts mNgramCounts;
+    const EntryCounts mMaxNgramCounts;
+    const int mExtendedRegionSize;
+    const bool mHasHistoricalInfoOfWords;
+    const int mForgettingCurveProbabilityValuesTableId;
+    const int *const mCodePointTable;
+
+    const std::vector readLocale() const;
+    float readMultipleWordCostMultiplier() const;
+    bool readRequiresGermanUmlautProcessing() const;
+    const EntryCounts readNgramCounts() const;
+    const EntryCounts readMaxNgramCounts() const;
+    static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
+            const uint8_t *const dictBuf);
+};
+} // namespace latinime
+#endif /* LATINIME_HEADER_POLICY_H */
diff --git a/app/src/main/jni/src/dictionary/header/header_read_write_utils.cpp b/app/src/main/jni/src/dictionary/header/header_read_write_utils.cpp
new file mode 100644
index 000000000..779f8b8c3
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/header/header_read_write_utils.cpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_read_write_utils.h"
+
+#include <cctype>
+#include <climits>
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+// Size of a char buffer large enough to hold any int printed in base 10: an optional minus
+// sign, up to 10 digits (INT_MIN is "-2147483648") and the zero terminator. It is used for the
+// string buffer that holds integer values in the {key,value} string pair scheme of the binary
+// dictionary header.
+const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 12;
+
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
+
+const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
+const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
+const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
+
+const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
+
+typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
+
+// Reads the header size field, which follows the magic number, version and flags fields.
+/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) {
+    // See the format of the header in the comment in
+    // BinaryDictionaryFormatUtils::detectFormatVersion()
+    return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE
+            + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE);
+}
+
+// Reads the flags field, which follows the magic number and version fields.
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) {
+    return ByteArrayUtils::readUint16(dictBuf,
+            HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE);
+}
+
+// Always returns NO_FLAGS; the attribute map is not consulted.
+/* static */ HeaderReadWriteUtils::DictionaryFlags
+        HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
+                const AttributeMap *const attributeMap) {
+    return NO_FLAGS;
+}
+
+// Reads {key,value} string pairs from the header options region of dictBuf, i.e. from
+// getHeaderOptionsPosition() up to the header size, and inserts them into headerAttributes.
+// A key that is already present in the map keeps its existing value.
+/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+        AttributeMap *const headerAttributes) {
+    const int headerSize = getHeaderSize(dictBuf);
+    int pos = getHeaderOptionsPosition();
+    if (pos == NOT_A_DICT_POS) {
+        // The header doesn't have header options.
+        return;
+    }
+    int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
+    // The value buffer is allocated on the heap rather than declared as a local array.
+    std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
+    while (pos < headerSize) {
+        // The values in the header don't use the code point table for their encoding.
+        const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+                MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
+        std::vector<int> key;
+        key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
+        const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
+                MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
+        std::vector<int> value;
+        value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
+        headerAttributes->insert(AttributeMap::value_type(key, value));
+    }
+}
+
+// Returns a pointer to the code points stored under CODE_POINT_TABLE_KEY, or nullptr when the
+// attribute is absent. The pointer refers to storage owned by headerAttributes.
+/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
+        AttributeMap *const headerAttributes) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
+    AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+    if (it == headerAttributes->end()) {
+        return nullptr;
+    }
+    return it->second.data();
+}
+
+// Writes the magic number followed by the format version. Returns false when a write fails or
+// when version is not VERSION_4_ONLY_FOR_TESTING, VERSION_402 or VERSION_403.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
+        BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
+        int *const writingPos) {
+    if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE,
+            writingPos)) {
+        return false;
+    }
+    switch (version) {
+        case FormatUtils::VERSION_2:
+        case FormatUtils::VERSION_201:
+        case FormatUtils::VERSION_202:
+            // None of the static dictionaries (v2x) support writing
+            return false;
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_402:
+        case FormatUtils::VERSION_403:
+            return buffer->writeUintAndAdvancePosition(version /* data */,
+                    HEADER_DICTIONARY_VERSION_SIZE, writingPos);
+        default:
+            return false;
+    }
+}
+
+// Writes flags as a field of HEADER_FLAG_SIZE at *writingPos.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags(
+        BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags,
+        int *const writingPos) {
+    return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos);
+}
+
+// Writes size as a field of HEADER_SIZE_FIELD_SIZE at *writingPos.
+/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize(
+        BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) {
+    return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos);
+}
+
+// Writes every attribute as a terminated key string followed by a terminated value string.
+// Attributes with an empty key or an empty value are skipped. Returns false on the first
+// failed write.
+/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes(
+        BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes,
+        int *const writingPos) {
+    for (AttributeMap::const_iterator it = headerAttributes->begin();
+            it != headerAttributes->end(); ++it) {
+        if (it->first.empty() || it->second.empty()) {
+            continue;
+        }
+        // Write a key.
+        if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(),
+                true /* writesTerminator */, writingPos)) {
+            return false;
+        }
+        // Write a value.
+        if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(),
+                true /* writesTerminator */, writingPos)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Stores a copy of value under key, replacing any existing entry.
+/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute(
+        AttributeMap *const headerAttributes, const char *const key,
+        const std::vector<int> &value) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    (*headerAttributes)[keyVector] = value;
+}
+
+// Stores value under key as the integer 1 (true) or 0 (false).
+/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
+        const char *const key, const bool value) {
+    setIntAttribute(headerAttributes, key, value ? 1 : 0);
+}
+
+// Stores value under key as its base-10 string representation.
+/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
+        const char *const key, const int value) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    setIntAttributeInner(headerAttributes, &keyVector, value);
+}
+
+/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
+        const AttributeMap::key_type *const key, const int value) {
+    AttributeMap::mapped_type valueVector;
+    // LARGEST_INT_DIGIT_COUNT leaves room for the sign, so negative values are not truncated.
+    char charBuf[LARGEST_INT_DIGIT_COUNT];
+    snprintf(charBuf, sizeof(charBuf), "%d", value);
+    insertCharactersIntoVector(charBuf, &valueVector);
+    (*headerAttributes)[*key] = valueVector;
+}
+
+// Returns a copy of the code points stored under key, or an empty vector when key is absent.
+/* static */ const std::vector<int> HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+    if (it == headerAttributes->end()) {
+        return std::vector<int>();
+    } else {
+        return it->second;
+    }
+}
+
+// Reads the attribute as an int and returns whether it is non-zero. defaultValue is used when
+// the attribute is absent or is not a number.
+/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key,
+        const bool defaultValue) {
+    const int intDefaultValue = defaultValue ? 1 : 0;
+    const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
+    return intValue != 0;
+}
+
+// Reads the attribute stored under key as a base-10 int; see readIntAttributeValueInner().
+/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
+        const AttributeMap *const headerAttributes, const char *const key,
+        const int defaultValue) {
+    AttributeMap::key_type keyVector;
+    insertCharactersIntoVector(key, &keyVector);
+    return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
+}
+
+// Parses the code points stored under key as an optionally '-'-prefixed sequence of ASCII
+// digits. Returns defaultValue when key is absent, when a non-digit code point is found or
+// when the number does not fit in an int. An empty value parses as 0.
+/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
+        const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
+        const int defaultValue) {
+    const AttributeMap::const_iterator it = headerAttributes->find(*key);
+    if (it == headerAttributes->end()) {
+        return defaultValue;
+    }
+    const AttributeMap::mapped_type &codePoints = it->second;
+    const bool isNegative = !codePoints.empty() && codePoints[0] == '-';
+    // Accumulate in a wider type so that an over-long digit string is detected instead of
+    // overflowing a signed int. The magnitude may reach INT_MAX + 1 because of INT_MIN.
+    const long long maxMagnitude = static_cast<long long>(INT_MAX) + 1;
+    long long magnitude = 0;
+    for (size_t i = isNegative ? 1 : 0; i < codePoints.size(); ++i) {
+        const int codePoint = codePoints[i];
+        // Compare against the ASCII digit range directly. The elements are code points, and
+        // isdigit() is only defined for EOF and values representable as unsigned char.
+        if (codePoint < '0' || codePoint > '9') {
+            // If not a number.
+            return defaultValue;
+        }
+        magnitude = magnitude * 10 + (codePoint - '0');
+        if (magnitude > maxMagnitude) {
+            return defaultValue;
+        }
+    }
+    if (isNegative) {
+        return static_cast<int>(-magnitude);
+    }
+    return magnitude > INT_MAX ? defaultValue : static_cast<int>(magnitude);
+}
+
+// Appends every char of the zero-terminated string characters to vector.
+/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters,
+        std::vector<int> *const vector) {
+    for (int i = 0; characters[i]; ++i) {
+        vector->push_back(characters[i]);
+    }
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/header/header_read_write_utils.h b/app/src/main/jni/src/dictionary/header/header_read_write_utils.h
new file mode 100644
index 000000000..f67d614df
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/header/header_read_write_utils.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H
+#define LATINIME_HEADER_READ_WRITE_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers to read and write the fixed header fields (magic number, version, flags,
+// header size) and the {key,value} attributes of a binary dictionary header.
+class HeaderReadWriteUtils {
+ public:
+    typedef uint16_t DictionaryFlags;
+
+    static int getHeaderSize(const uint8_t *const dictBuf);
+
+    static DictionaryFlags getFlags(const uint8_t *const dictBuf);
+
+    // Returns the sum of the sizes of the magic number, version, flags and header size fields.
+    static AK_FORCE_INLINE int getHeaderOptionsPosition() {
+        return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE
+                + HEADER_SIZE_FIELD_SIZE;
+    }
+
+    static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap(
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+    // Fills headerAttributes from the header of dictBuf.
+    static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
+            DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
+    // The returned pointer may be null; see CODE_POINT_TABLE_KEY for the attribute it reads.
+    static const int *readCodePointTable(
+            DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
+    // The write* methods take the position to write at through writingPos and return a bool
+    // result. NOTE(review): presumably true means the write succeeded -- confirm against
+    // BufferWithExtendableBuffer.
+    static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
+            const FormatUtils::FORMAT_VERSION version, int *const writingPos);
+
+    static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer,
+            const DictionaryFlags flags, int *const writingPos);
+
+    static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer,
+            const int size, int *const writingPos);
+
+    static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            int *const writingPos);
+
+    /**
+     * Methods for header attributes.
+     * Keys are passed as zero-terminated char strings. The read methods that take a
+     * defaultValue return a value of the same type as that default.
+     */
+    static void setCodePointVectorAttribute(
+            DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const char *const key, const std::vector<int> &value);
+
+    static void setBoolAttribute(
+            DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const char *const key, const bool value);
+
+    static void setIntAttribute(
+            DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const char *const key, const int value);
+
+    static const std::vector<int> readCodePointVectorAttributeValue(
+            const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const char *const key);
+
+    static bool readBoolAttributeValue(
+            const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const char *const key, const bool defaultValue);
+
+    static int readIntAttributeValue(
+            const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const char *const key, const int defaultValue);
+
+    // Converts a zero-terminated char string into the key type of the attribute map.
+    static void insertCharactersIntoVector(const char *const characters,
+            DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils);
+
+    static const int LARGEST_INT_DIGIT_COUNT;
+    static const int MAX_ATTRIBUTE_KEY_LENGTH;
+    static const int MAX_ATTRIBUTE_VALUE_LENGTH;
+
+    static const int HEADER_MAGIC_NUMBER_SIZE;
+    static const int HEADER_DICTIONARY_VERSION_SIZE;
+    static const int HEADER_FLAG_SIZE;
+    static const int HEADER_SIZE_FIELD_SIZE;
+
+    static const char *const CODE_POINT_TABLE_KEY;
+
+    // Value for the "flags" field. It's unused at the moment.
+    static const DictionaryFlags NO_FLAGS;
+
+    // Variants of setIntAttribute() / readIntAttributeValue() that take an already converted key.
+    static void setIntAttributeInner(
+            DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key,
+            const int value);
+
+    static int readIntAttributeValueInner(
+            const DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
+            const DictionaryHeaderStructurePolicy::AttributeMap::key_type *const key,
+            const int defaultValue);
+};
+}
+#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
diff --git a/app/src/main/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h b/app/src/main/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h
new file mode 100644
index 000000000..aa0d068aa
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/*
+ * This class abstracts structure of bigrams.
+ */
+class DictionaryBigramsStructurePolicy {
+ public:
+    virtual ~DictionaryBigramsStructurePolicy() {}
+
+    // NOTE(review): judging by the names, the out* parameters receive one bigram and pos is an
+    // in/out cursor -- confirm against the implementations.
+    virtual void getNextBigram(int *const outBigramPos, int *const outProbability,
+            bool *const outHasNext, int *const pos) const = 0;
+    virtual bool skipAllBigrams(int *const pos) const = 0;
+
+ protected:
+    DictionaryBigramsStructurePolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictionaryBigramsStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H */
diff --git a/app/src/main/jni/src/dictionary/interface/dictionary_header_structure_policy.h b/app/src/main/jni/src/dictionary/interface/dictionary_header_structure_policy.h
new file mode 100644
index 000000000..6da390e55
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/interface/dictionary_header_structure_policy.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H
+
+#include <map>
+#include <vector>
+
+#include "defines.h"
+
+namespace latinime {
+
+/*
+ * This class abstracts structure of dictionaries.
+ * Implement this policy to support additional dictionaries.
+ */
+class DictionaryHeaderStructurePolicy {
+ public:
+    // Maps an attribute key to its value; both are vectors of int code points.
+    typedef std::map<std::vector<int>, std::vector<int>> AttributeMap;
+
+    virtual ~DictionaryHeaderStructurePolicy() {}
+
+    virtual int getFormatVersionNumber() const = 0;
+
+    virtual int getSize() const = 0;
+
+    virtual const AttributeMap *getAttributeMap() const = 0;
+
+    virtual bool requiresGermanUmlautProcessing() const = 0;
+
+    virtual float getMultiWordCostMultiplier() const = 0;
+
+    // NOTE(review): presumably writes the value stored for key, or a question mark when there
+    // is none, into outValue (at most outValueSize elements) -- confirm against implementations.
+    virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue,
+            int outValueSize) const = 0;
+
+    virtual bool shouldBoostExactMatches() const = 0;
+
+    virtual const std::vector<int> *getLocale() const = 0;
+
+    virtual bool supportsBeginningOfSentence() const = 0;
+
+ protected:
+    DictionaryHeaderStructurePolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictionaryHeaderStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H */
diff --git a/app/src/main/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h b/app/src/main/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
new file mode 100644
index 000000000..40b6c2de1
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/*
+ * This class abstracts structure of shortcuts.
+ * It is a pure interface: every operation is a pure virtual const method, and the constructor
+ * is protected so that only subclasses can be instantiated.
+ */
+class DictionaryShortcutsStructurePolicy {
+ public:
+    virtual ~DictionaryShortcutsStructurePolicy() {}
+
+    virtual int getStartPos(const int pos) const = 0;
+
+    // NOTE(review): judging by the names, the out* parameters receive one shortcut and pos is
+    // an in/out cursor -- confirm against the implementations.
+    virtual void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+            int *const pos) const = 0;
+
+    virtual void skipAllShortcuts(int *const pos) const = 0;
+
+ protected:
+    DictionaryShortcutsStructurePolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictionaryShortcutsStructurePolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H */
diff --git a/app/src/main/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h b/app/src/main/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h
new file mode 100644
index 000000000..ace48491d
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H
+#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H
+
+#include <memory>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/word_attributes.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+class DictionaryHeaderStructurePolicy;
+class MultiBigramMap;
+class NgramListener;
+class NgramContext;
+class UnigramProperty;
+
+/*
+ * This class abstracts the structure of dictionaries.
+ * Implement this policy to support additional dictionaries.
+ */
+class DictionaryStructureWithBufferPolicy {
+ public:
+    typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
+
+    virtual ~DictionaryStructureWithBufferPolicy() {}
+
+    virtual int getRootPosition() const = 0;
+
+    virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+            DicNodeVector *const childDicNodes) const = 0;
+
+    virtual int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+            int *const outCodePoints) const = 0;
+
+    virtual int getWordId(const CodePointArrayView wordCodePoints,
+            const bool forceLowerCaseSearch) const = 0;
+
+    virtual const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+            const int wordId, MultiBigramMap *const multiBigramMap) const = 0;
+
+    // TODO: Remove
+    virtual int getProbability(const int unigramProbability, const int bigramProbability) const = 0;
+
+    virtual int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const = 0;
+
+    virtual void iterateNgramEntries(const WordIdArrayView prevWordIds,
+            NgramListener *const listener) const = 0;
+
+    virtual BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const = 0;
+
+    virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
+
+    // Returns whether the update succeeded.
+    virtual bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+            const UnigramProperty *const unigramProperty) = 0;
+
+    // Returns whether the update succeeded.
+    virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0;
+
+    // Returns whether the update succeeded.
+    virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0;
+
+    // Returns whether the update succeeded.
+    virtual bool removeNgramEntry(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints) = 0;
+
+    // Returns whether the update succeeded.
+    virtual bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints, const bool isValidWord,
+            const HistoricalInfo historicalInfo) = 0;
+
+    // Returns whether the flush succeeded.
+    virtual bool flush(const char *const filePath) = 0;
+
+    // Returns whether both the GC and the flush succeeded.
+    virtual bool flushWithGC(const char *const filePath) = 0;
+
+    virtual bool needsToRunGC(const bool mindsBlockByGC) const = 0;
+
+    // Currently, this method is used only for testing. Consider creating a new dedicated
+    // method instead of this one if you need this functionality in production.
+    virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
+            const int maxResultLength) = 0;
+
+    virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0;
+
+    // Method to iterate all words in the dictionary.
+    // The returned token has to be used to get the next word. If token is 0, this method newly
+    // starts iterating the dictionary.
+    virtual int getNextWordAndNextToken(const int token, int *const outCodePoints,
+            int *const outCodePointCount) = 0;
+
+    virtual bool isCorrupted() const = 0;
+
+ protected:
+    DictionaryStructureWithBufferPolicy() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(DictionaryStructureWithBufferPolicy);
+};
+} // namespace latinime
+#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */
diff --git a/app/src/main/jni/src/dictionary/interface/ngram_listener.h b/app/src/main/jni/src/dictionary/interface/ngram_listener.h
new file mode 100644
index 000000000..2eb5e9fd1
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/interface/ngram_listener.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_LISTENER_H
+#define LATINIME_NGRAM_LISTENER_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Interface to iterate ngram entries.
+ */
+class NgramListener {
+ public:
+    // ngramProbability is always 0 for v403 decaying dictionary.
+    // TODO: Remove ngramProbability.
+    virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0;
+    virtual ~NgramListener() {};
+
+ protected:
+    NgramListener() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(NgramListener);
+
+};
+} // namespace latinime
+#endif /* LATINIME_NGRAM_LISTENER_H */
diff --git a/app/src/main/jni/src/dictionary/property/historical_info.h b/app/src/main/jni/src/dictionary/property/historical_info.h
new file mode 100644
index 000000000..e5ce1ea25
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/property/historical_info.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_HISTORICAL_INFO_H
+#define LATINIME_HISTORICAL_INFO_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Value holder for a timestamp, a level and a count. All three are fixed at construction and
+// are only exposed through getters.
+class HistoricalInfo {
+ public:
+    // Invalid historical info.
+    // The timestamp is NOT_A_TIMESTAMP, so isValid() returns false; level and count are 0.
+    HistoricalInfo()
+            : mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0) {}
+
+    // Stores the given values as they are; no validation is done here.
+    HistoricalInfo(const int timestamp, const int level, const int count)
+            : mTimestamp(timestamp), mLevel(level), mCount(count) {}
+
+    // Returns whether the timestamp differs from NOT_A_TIMESTAMP. Level and count are not
+    // taken into account.
+    bool isValid() const {
+        return mTimestamp != NOT_A_TIMESTAMP;
+    }
+
+    int getTimestamp() const {
+        return mTimestamp;
+    }
+
+    // TODO: Remove
+    int getLevel() const {
+        return mLevel;
+    }
+
+    int getCount() const {
+        return mCount;
+    }
+
+ private:
+    // Default copy constructor is used for using in std::vector.
+ DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo); + + const int mTimestamp; + const int mLevel; + const int mCount; +}; +} // namespace latinime +#endif /* LATINIME_HISTORICAL_INFO_H */ diff --git a/app/src/main/jni/src/dictionary/property/ngram_context.cpp b/app/src/main/jni/src/dictionary/property/ngram_context.cpp new file mode 100644 index 000000000..7b9c3eff6 --- /dev/null +++ b/app/src/main/jni/src/dictionary/property/ngram_context.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/property/ngram_context.h" + +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "utils/char_utils.h" + +namespace latinime { + +NgramContext::NgramContext() : mPrevWordCount(0) {} + +NgramContext::NgramContext(const NgramContext &ngramContext) + : mPrevWordCount(ngramContext.mPrevWordCount) { + for (size_t i = 0; i < mPrevWordCount; ++i) { + mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i]; + memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i], + sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); + mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i]; + } +} + +NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH], + const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, + const size_t prevWordCount) + : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) { + clear(); + for (size_t i = 0; i < mPrevWordCount; ++i) { + if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { + continue; + } + memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], + sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); + mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; + mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; + } +} + +NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount, + const bool isBeginningOfSentence) : mPrevWordCount(1) { + clear(); + if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { + return; + } + memmove(mPrevWordCodePoints[0], prevWordCodePoints, + sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); + mPrevWordCodePointCount[0] = prevWordCodePointCount; + mIsBeginningOfSentence[0] = isBeginningOfSentence; +} + +bool NgramContext::isValid() const { + if (mPrevWordCodePointCount[0] > 0) { + return true; + } + if (mIsBeginningOfSentence[0]) { + return true; + 
} + return false; +} + +const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const { + if (n <= 0 || n > mPrevWordCount) { + return CodePointArrayView(); + } + return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]); +} + +bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const { + if (n <= 0 || n > mPrevWordCount) { + return false; + } + return mIsBeginningOfSentence[n - 1]; +} + +/* static */ int NgramContext::getWordId( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *const wordCodePoints, const int wordCodePointCount, + const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { + if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { + return NOT_A_WORD_ID; + } + int codePoints[MAX_WORD_LENGTH]; + int codePointCount = wordCodePointCount; + memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); + if (isBeginningOfSentence) { + codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount, + MAX_WORD_LENGTH); + if (codePointCount <= 0) { + return NOT_A_WORD_ID; + } + } + const CodePointArrayView codePointArrayView(codePoints, codePointCount); + const int wordId = dictStructurePolicy->getWordId(codePointArrayView, + false /* forceLowerCaseSearch */); + if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) { + // Return the id when the word was found or when lower case search is not requested. + return wordId; + } + // Check bigrams for lower-cased previous word if original was not found. Useful for + // auto-capitalized words like "The [current_word]". 
+ return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */); +} + +void NgramContext::clear() { + for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { + mPrevWordCodePointCount[i] = 0; + mIsBeginningOfSentence[i] = false; + } +} +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/property/ngram_context.h b/app/src/main/jni/src/dictionary/property/ngram_context.h new file mode 100644 index 000000000..9b36199c9 --- /dev/null +++ b/app/src/main/jni/src/dictionary/property/ngram_context.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_NGRAM_CONTEXT_H +#define LATINIME_NGRAM_CONTEXT_H + +#include + +#include "defines.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicy; + +class NgramContext { + public: + // No prev word information. + NgramContext(); + // Copy constructor to use this class with std::vector and use this class as a return value. + NgramContext(const NgramContext &ngramContext); + // Construct from previous words. + NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH], + const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, + const size_t prevWordCount); + // Construct from a previous word. 
+ NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount, + const bool isBeginningOfSentence); + + size_t getPrevWordCount() const { + return mPrevWordCount; + } + bool isValid() const; + + template + const WordIdArrayView getPrevWordIds( + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + WordIdArray *const prevWordIdBuffer, const bool tryLowerCaseSearch) const { + for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) { + prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i], + mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch); + } + return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount); + } + + // n is 1-indexed. + const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const; + // n is 1-indexed. + bool isNthPrevWordBeginningOfSentence(const size_t n) const; + + private: + DISALLOW_ASSIGNMENT_OPERATOR(NgramContext); + + static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, + const int *const wordCodePoints, const int wordCodePointCount, + const bool isBeginningOfSentence, const bool tryLowerCaseSearch); + void clear(); + + const size_t mPrevWordCount; + int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; + int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; +}; +} // namespace latinime +#endif // LATINIME_NGRAM_CONTEXT_H diff --git a/app/src/main/jni/src/dictionary/property/ngram_property.h b/app/src/main/jni/src/dictionary/property/ngram_property.h new file mode 100644 index 000000000..5f259ec59 --- /dev/null +++ b/app/src/main/jni/src/dictionary/property/ngram_property.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_NGRAM_PROPERTY_H +#define LATINIME_NGRAM_PROPERTY_H + +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/ngram_context.h" + +namespace latinime { + +class NgramProperty { + public: + NgramProperty(const NgramContext &ngramContext, const std::vector &&targetCodePoints, + const int probability, const HistoricalInfo historicalInfo) + : mNgramContext(ngramContext), mTargetCodePoints(std::move(targetCodePoints)), + mProbability(probability), mHistoricalInfo(historicalInfo) {} + + const NgramContext *getNgramContext() const { + return &mNgramContext; + } + + const std::vector *getTargetCodePoints() const { + return &mTargetCodePoints; + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo getHistoricalInfo() const { + return mHistoricalInfo; + } + + private: + // Default copy constructor is used for using in std::vector. 
+ DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty); + DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty); + + const NgramContext mNgramContext; + const std::vector mTargetCodePoints; + const int mProbability; + const HistoricalInfo mHistoricalInfo; +}; +} // namespace latinime +#endif // LATINIME_NGRAM_PROPERTY_H diff --git a/app/src/main/jni/src/dictionary/property/unigram_property.h b/app/src/main/jni/src/dictionary/property/unigram_property.h new file mode 100644 index 000000000..92f61b85d --- /dev/null +++ b/app/src/main/jni/src/dictionary/property/unigram_property.h @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_UNIGRAM_PROPERTY_H +#define LATINIME_UNIGRAM_PROPERTY_H + +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" + +namespace latinime { + +class UnigramProperty { + public: + class ShortcutProperty { + public: + ShortcutProperty(const std::vector &&targetCodePoints, const int probability) + : mTargetCodePoints(std::move(targetCodePoints)), + mProbability(probability) {} + + const std::vector *getTargetCodePoints() const { + return &mTargetCodePoints; + } + + int getProbability() const { + return mProbability; + } + + private: + // Default copy constructor is used for using in std::vector. 
+ DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty); + + const std::vector mTargetCodePoints; + const int mProbability; + }; + + UnigramProperty() + : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), + mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY), + mHistoricalInfo(), mShortcuts() {} + + // In contexts which do not support the Blacklisted flag (v2, v4<403) + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isPossiblyOffensive, const int probability, + const HistoricalInfo historicalInfo, const std::vector &&shortcuts) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(false), + mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), + mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {} + + // Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403) + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isPossiblyOffensive, const int probability, + const HistoricalInfo historicalInfo) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(false), + mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), + mHistoricalInfo(historicalInfo), mShortcuts() {} + + // In contexts which DO support the Blacklisted flag (v403) + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isBlacklisted, const bool isPossiblyOffensive, const int probability, + const HistoricalInfo historicalInfo, const std::vector &&shortcuts) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), + mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), + mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {} + + // Without shortcuts, in contexts which 
DO support the Blacklisted flag (v403) + UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, + const bool isBlacklisted, const bool isPossiblyOffensive, const int probability, + const HistoricalInfo historicalInfo) + : mRepresentsBeginningOfSentence(representsBeginningOfSentence), + mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), + mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability), + mHistoricalInfo(historicalInfo), mShortcuts() {} + + bool representsBeginningOfSentence() const { + return mRepresentsBeginningOfSentence; + } + + bool isNotAWord() const { + return mIsNotAWord; + } + + bool isPossiblyOffensive() const { + return mIsPossiblyOffensive; + } + + bool isBlacklisted() const { + return mIsBlacklisted; + } + + bool hasShortcuts() const { + return !mShortcuts.empty(); + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo getHistoricalInfo() const { + return mHistoricalInfo; + } + + const std::vector &getShortcuts() const { + return mShortcuts; + } + + private: + // Default copy constructor is used for using as a return value. + DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); + + const bool mRepresentsBeginningOfSentence; + const bool mIsNotAWord; + const bool mIsBlacklisted; + const bool mIsPossiblyOffensive; + const int mProbability; + const HistoricalInfo mHistoricalInfo; + const std::vector mShortcuts; +}; +} // namespace latinime +#endif // LATINIME_UNIGRAM_PROPERTY_H diff --git a/app/src/main/jni/src/dictionary/property/word_attributes.h b/app/src/main/jni/src/dictionary/property/word_attributes.h new file mode 100644 index 000000000..5351e7d7d --- /dev/null +++ b/app/src/main/jni/src/dictionary/property/word_attributes.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORD_ATTRIBUTES_H +#define LATINIME_WORD_ATTRIBUTES_H + +#include "defines.h" + +class WordAttributes { + public: + // Invalid word attributes. + WordAttributes() + : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false), + mIsPossiblyOffensive(false) {} + + WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord, + const bool isPossiblyOffensive) + : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord), + mIsPossiblyOffensive(isPossiblyOffensive) {} + + int getProbability() const { + return mProbability; + } + + bool isBlacklisted() const { + return mIsBlacklisted; + } + + bool isNotAWord() const { + return mIsNotAWord; + } + + // Whether or not a word is possibly offensive. + // * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag. + // * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model + // flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero) + // + // See the ::getWordAttributes function for each of these dictionary policies for more details. 
+ bool isPossiblyOffensive() const { + return mIsPossiblyOffensive; + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes); + + int mProbability; + bool mIsBlacklisted; + bool mIsNotAWord; + bool mIsPossiblyOffensive; +}; + + // namespace +#endif /* LATINIME_WORD_ATTRIBUTES_H */ diff --git a/app/src/main/jni/src/dictionary/property/word_property.h b/app/src/main/jni/src/dictionary/property/word_property.h new file mode 100644 index 000000000..3028e020a --- /dev/null +++ b/app/src/main/jni/src/dictionary/property/word_property.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORD_PROPERTY_H +#define LATINIME_WORD_PROPERTY_H + +#include + +#include "defines.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "utils/int_array_view.h" + +namespace latinime { + +// This class is used for returning information belonging to a word to java side. +class WordProperty { + public: + // Default constructor is used to create an instance that indicates an invalid word. 
+ WordProperty() + : mCodePoints(), mUnigramProperty(), mNgrams() {} + + WordProperty(const std::vector &&codePoints, const UnigramProperty &unigramProperty, + const std::vector &ngrams) + : mCodePoints(std::move(codePoints)), mUnigramProperty(unigramProperty), + mNgrams(ngrams) {} + + const CodePointArrayView getCodePoints() const { + return CodePointArrayView(mCodePoints); + } + + const UnigramProperty &getUnigramProperty() const { + return mUnigramProperty; + } + + const std::vector &getNgramProperties() const { + return mNgrams; + } + + private: + // Default copy constructor is used for using as a return value. + DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); + + const std::vector mCodePoints; + const UnigramProperty mUnigramProperty; + const std::vector mNgrams; +}; +} // namespace latinime +#endif // LATINIME_WORD_PROPERTY_H diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/Readme.txt b/app/src/main/jni/src/dictionary/structure/backward/v402/Readme.txt new file mode 100644 index 000000000..9e29e836c --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/Readme.txt @@ -0,0 +1 @@ +Files under this directory have been auto generated. diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp new file mode 100644 index 000000000..60749bce6 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp @@ -0,0 +1,289 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp + */ + +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" + +#include "dictionary/header/header_policy.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const { + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos); + if (outBigramPos) { + // Lookup target PtNode position. 
+ *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition( + bigramEntry.getTargetTerminalId()); + } + if (outProbability) { + if (bigramEntry.hasHistoricalInfo()) { + *outProbability = + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(), + mHeaderPolicy); + } else { + *outProbability = bigramEntry.getProbability(); + } + } + if (outHasNext) { + *outHasNext = bigramEntry.hasNext(); + } +} + +bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { + // 1. The word has no bigrams yet. + // 2. The word has bigrams, and there is the target in the list. + // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. + // 4. The word has bigrams. We have to append new bigram entry to the list. + // 5. Same as 4, but the list is the last entry of the content file. + if (outAddedNewEntry) { + *outAddedNewEntry = false; + } + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Case 1. PtNode that doesn't have a bigram list. + // Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, + ngramProperty); + // Write an entry. 
+ const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; + } + + int tailEntryPos = NOT_A_DICT_POS; + const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos, + &tailEntryPos); + if (tailEntryPos != NOT_A_DICT_POS || entryPosToUpdate == NOT_A_DICT_POS) { + // Case 4, 5. + // Add new entry to the bigram list. + if (tailEntryPos == NOT_A_DICT_POS) { + // Case 4. Create new bigram list. + if (!mBigramDictContent->createNewBigramList(terminalId)) { + return false; + } + const int destPos = mBigramDictContent->getBigramListHeadPos(terminalId); + // Copy existing bigram list. + if (!mBigramDictContent->copyBigramList(bigramListPos, destPos, &tailEntryPos)) { + return false; + } + } + // Write new entry at the tail position of the bigram content. + const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( + &newBigramEntry, ngramProperty); + if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) { + return false; + } + // Update has next flag of the tail entry. + if (!updateHasNextFlag(true /* hasNext */, tailEntryPos)) { + return false; + } + if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + return true; + } + + // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry. + const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (!originalBigramEntry.isValid()) { + // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing + // entry is updated. 
+ if (outAddedNewEntry) { + *outAddedNewEntry = true; + } + } + const BigramEntry updatedBigramEntry = + originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); + const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( + &updatedBigramEntry, ngramProperty); + return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); +} + +bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. + return false; + } + const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos, + nullptr /* outTailEntryPos */); + if (entryPosToUpdate == NOT_A_DICT_POS) { + // Bigram entry doesn't exist. + return false; + } + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); + if (targetTerminalId != bigramEntry.getTargetTerminalId()) { + // Bigram entry doesn't exist. + return false; + } + // Remove bigram entry by marking it as invalid entry and overwriting the original entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate); +} + +bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, + int *const outBigramCount) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. 
+ return true; + } + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!bigramEntry.isValid()) { + continue; + } + const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition( + bigramEntry.getTargetTerminalId()); + if (targetPtNodePos == NOT_A_DICT_POS) { + // Invalidate bigram entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + } else if (bigramEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + bigramEntry.getHistoricalInfo(), mHeaderPolicy); + if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) { + const BigramEntry updatedBigramEntry = + bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + *outBigramCount += 1; + } else { + // Remove entry. + const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { + return false; + } + } + } else { + *outBigramCount += 1; + } + } + return true; +} + +int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { + const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); + if (bigramListPos == NOT_A_DICT_POS) { + // Bigram list doesn't exist. 
+ return 0; + } + int bigramCount = 0; + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (bigramEntry.isValid()) { + bigramCount++; + } + } + return bigramCount; +} + +int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, + const int bigramListPos, int *const outTailEntryPos) const { + if (outTailEntryPos) { + *outTailEntryPos = NOT_A_DICT_POS; + } + bool hasNext = true; + int invalidEntryPos = NOT_A_DICT_POS; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) { + // Entry with same target is found. + return entryPos; + } else if (!bigramEntry.isValid()) { + // Invalid entry that can be reused is found. + invalidEntryPos = entryPos; + } + if (!hasNext && mBigramDictContent->isContentTailPos(readingPos)) { + if (outTailEntryPos) { + *outTailEntryPos = entryPos; + } + } + } + return invalidEntryPos; +} + +const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( + const BigramEntry *const originalBigramEntry, + const NgramProperty *const ngramProperty) const { + // TODO: Consolidate historical info and probability. 
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo &historicalInfoForUpdate = ngramProperty->getHistoricalInfo(); + const HistoricalInfo updatedHistoricalInfo = + ForgettingCurveUtils::createUpdatedHistoricalInfo( + originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(), + &historicalInfoForUpdate, mHeaderPolicy); + return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); + } else { + return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability()); + } +} + +bool Ver4BigramListPolicy::updateHasNextFlag(const bool hasNext, const int bigramEntryPos) { + const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(bigramEntryPos); + const BigramEntry updatedBigramEntry = bigramEntry.updateHasNextAndGetEntry(hasNext); + return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, bigramEntryPos); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/app/src/main/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h new file mode 100644 index 000000000..58c88ce8a --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! 
DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H +#define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/structure/backward/v402/content/bigram_entry.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class BigramDictContent; +} // namespace v402 +} // namespace backward +class NgramProperty; +namespace backward { +namespace v402 { +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class TerminalPositionLookupTable; + +class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, + const TerminalPositionLookupTable *const terminalPositionLookupTable, + const HeaderPolicy *const headerPolicy) + : mBigramDictContent(bigramDictContent), + mTerminalPositionLookupTable(terminalPositionLookupTable), + mHeaderPolicy(headerPolicy) {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const; + + bool skipAllBigrams(int *const pos) const { + // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. 
+ return true; + } + + bool addNewEntry(const int terminalId, const int newTargetTerminalId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); + + bool removeEntry(const int terminalId, const int targetTerminalId); + + bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, + int *const outBigramCount); + + int getBigramEntryConut(const int terminalId); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); + + int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos, + int *const outTailEntryPos) const; + + const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, + const NgramProperty *const ngramProperty) const; + + bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos); + + BigramDictContent *const mBigramDictContent; + const TerminalPositionLookupTable *const mTerminalPositionLookupTable; + const HeaderPolicy *const mHeaderPolicy; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp new file mode 100644 index 000000000..7fa85dec2 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/bigram_dict_content.cpp + */ + +#include "dictionary/structure/backward/v402/content/bigram_dict_content.h" + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( + int *const bigramEntryPos) const { + const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); + const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); + if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { + AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, " + "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, + bigramListBuffer->getTailPosition()); + ASSERT(false); + return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, + Ver4DictConstants::NOT_A_TERMINAL_ID); + } + const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos); + const bool hasNext = (bigramFlags & Ver4DictConstants::BIGRAM_HAS_NEXT_MASK) != 0; + int probability = NOT_A_PROBABILITY; + int timestamp = NOT_A_TIMESTAMP; + int level = 0; + int count = 0; + if (mHasHistoricalInfo) { + timestamp = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos); + level = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos); + count = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos); + } else { + probability = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); + } + const int 
encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos); + const int targetTerminalId = + (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? + Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; + if (mHasHistoricalInfo) { + // Hack for better migration. + count += level; + const HistoricalInfo historicalInfo(timestamp, level, count); + return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId); + } else { + return BigramEntry(hasNext, probability, targetTerminalId); + } +} + +bool BigramDictContent::writeBigramEntryAndAdvancePosition( + const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) { + BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer(); + const int bigramFlags = createAndGetBigramFlags(bigramEntryToWrite->hasNext()); + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags, + Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags); + return false; + } + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo(); + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos, + historicalInfo->getTimestamp()); + return false; + } + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram level. 
pos: %d, level: %d", *entryWritingPos, + historicalInfo->getLevel()); + return false; + } + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos, + historicalInfo->getCount()); + return false; + } + } else { + if (!bigramListBuffer->writeUintAndAdvancePosition(bigramEntryToWrite->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos, + bigramEntryToWrite->getProbability()); + return false; + } + } + const int targetTerminalIdToWrite = + (bigramEntryToWrite->getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) ? + Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : + bigramEntryToWrite->getTargetTerminalId(); + if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite, + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) { + AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d", + *entryWritingPos, bigramEntryToWrite->getTargetTerminalId()); + return false; + } + return true; +} + +bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos, + int *const outTailEntryPos) { + int readingPos = bigramListPos; + int writingPos = toPos; + bool hasNext = true; + while (hasNext) { + const BigramEntry bigramEntry = getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!hasNext) { + *outTailEntryPos = writingPos; + } + if (!writeBigramEntryAndAdvancePosition(&bigramEntry, &writingPos)) { + AKLOGE("Cannot write bigram entry to copy. 
pos: %d", writingPos); + return false; + } + } + return true; +} + +bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const BigramDictContent *const originalBigramDictContent, + int *const outBigramEntryCount) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int originalBigramListPos = + originalBigramDictContent->getBigramListHeadPos(it->first); + if (originalBigramListPos == NOT_A_DICT_POS) { + // This terminal does not have a bigram list. + continue; + } + const int bigramListPos = getContentBuffer()->getTailPosition(); + int bigramEntryCount = 0; + // Copy bigram list with GC from original content. + if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos, + terminalIdMap, &bigramEntryCount)) { + AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d", + originalBigramListPos, bigramListPos); + return false; + } + if (bigramEntryCount == 0) { + // All bigram entries are useless. This terminal does not have a bigram list. + continue; + } + *outBigramEntryCount += bigramEntryCount; + // Set bigram list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) { + AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d", + it->second, bigramListPos); + return false; + } + } + return true; +} + +// Returns whether GC for the bigram list was succeeded or not. 
+bool BigramDictContent::runGCBigramList(const int bigramListPos,
+        const BigramDictContent *const sourceBigramDictContent, const int toPos,
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        int *const outEntrycount) {
+    bool hasNext = true;
+    int readingPos = bigramListPos;  // Cursor in sourceBigramDictContent.
+    int writingPos = toPos;  // Cursor in this content; advanced by every successful write.
+    int lastEntryPos = NOT_A_DICT_POS;  // Last written entry if it was written with hasNext set.
+    while (hasNext) {
+        const BigramEntry originalBigramEntry =
+                sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+        hasNext = originalBigramEntry.hasNext();
+        if (originalBigramEntry.getTargetTerminalId() == Ver4DictConstants::NOT_A_TERMINAL_ID) {
+            continue;  // Entry has no target terminal; nothing to copy.
+        }
+        TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+                terminalIdMap->find(originalBigramEntry.getTargetTerminalId());
+        if (it == terminalIdMap->end()) {
+            // Target word has been removed.
+            continue;
+        }
+        lastEntryPos = hasNext ? writingPos : NOT_A_DICT_POS;
+        const BigramEntry updatedBigramEntry =
+                originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second);
+        if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) {
+            AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos);
+            return false;
+        }
+        *outEntrycount += 1;  // Added to the caller's value; the caller must initialize it.
+    }
+    if (lastEntryPos != NOT_A_DICT_POS) {
+        // Update has next flag in the last written entry.
+        const BigramEntry bigramEntry = getBigramEntry(lastEntryPos).updateHasNextAndGetEntry(
+                false /* hasNext */);
+        if (!writeBigramEntry(&bigramEntry, lastEntryPos)) {
+            AKLOGE("Cannot write bigram entry to set hasNext flag after GC. pos: %d", lastEntryPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
new file mode 100644
index 000000000..14f334a12
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ * + * This file was generated from + * dictionary/structure/v4/content/bigram_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/bigram_entry.h" +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class BigramDictContent : public SparseTableDictContent { + public: + BigramDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SparseTableDictContent(dictPath, + Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_FILE_EXTENSION, isUpdatable, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), + mHasHistoricalInfo(hasHistoricalInfo) {} + + BigramDictContent(const bool hasHistoricalInfo) + : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), + mHasHistoricalInfo(hasHistoricalInfo) {} + + const BigramEntry getBigramEntry(const int bigramEntryPos) const { + int readingPos = bigramEntryPos; + return getBigramEntryAndAdvancePosition(&readingPos); + } + + const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const; + + // Returns head position of bigram list for a PtNode specified by terminalId. 
+ int getBigramListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); + } + + bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) { + int writingPos = getContentBuffer()->getTailPosition(); + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + + bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { + int writingPos = entryWritingPos; + return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); + } + + bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite, + int *const entryWritingPos); + + bool createNewBigramList(const int terminalId) { + const int bigramListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); + } + + bool copyBigramList(const int bigramListPos, const int toPos, int *const outTailEntryPos); + + bool flushToFile(const char *const dictPath) const { + return flush(dictPath, Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION, + Ver4DictConstants::BIGRAM_FILE_EXTENSION); + } + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const BigramDictContent *const originalBigramDictContent, + int *const outBigramEntryCount); + + bool isContentTailPos(const int pos) const { + return pos == getContentBuffer()->getTailPosition(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(BigramDictContent); + + int createAndGetBigramFlags(const bool hasNext) const { + return hasNext ? 
Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0; + } + + int getBigramEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } else { + return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; + } + } + + bool runGCBigramList(const int bigramListPos, + const BigramDictContent *const sourceBigramDictContent, const int toPos, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + int *const outEntryCount); + + bool mHasHistoricalInfo; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h new file mode 100644 index 000000000..36ad855ee --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/bigram_entry.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
+#define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+class BigramEntry {  // Immutable bigram list entry; update*AndGetEntry() return modified copies.
+ public:
+    BigramEntry(const BigramEntry& bigramEntry) : mHasNext(bigramEntry.mHasNext),
+            mProbability(bigramEntry.mProbability), mHistoricalInfo(bigramEntry.mHistoricalInfo),
+            mTargetTerminalId(bigramEntry.mTargetTerminalId) {}  // Copies every field.
+
+    // Entry without historical information.
+    BigramEntry(const bool hasNext, const int probability, const int targetTerminalId)
+            : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(),
+              mTargetTerminalId(targetTerminalId) {}
+
+    // Entry with historical information. historicalInfo is dereferenced and must not be null.
+    BigramEntry(const bool hasNext, const int probability,
+            const HistoricalInfo *const historicalInfo, const int targetTerminalId)
+            : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo),
+              mTargetTerminalId(targetTerminalId) {}
+
+    const BigramEntry getInvalidatedEntry() const {
+        return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID);
+    }
+
+    const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const {
+        return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId);
+    }
+
+    const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const {
+        return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId);
+    }
+
+    const BigramEntry updateProbabilityAndGetEntry(const int probability) const {
+        return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId);
+    }
+
+    const BigramEntry updateHistoricalInfoAndGetEntry(
+            const HistoricalInfo *const historicalInfo) const {
+        return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId);
+    }
+
+    bool isValid() const {  // False once the entry has been invalidated.
+        return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+    }
+
+    bool hasNext() const {
+        return mHasNext;
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    bool hasHistoricalInfo() const {
+        return mHistoricalInfo.isValid();
+    }
+
+    const HistoricalInfo *getHistoricalInfo() const {  // Points into this entry.
+        return &mHistoricalInfo;
+    }
+
+    int getTargetTerminalId() const {
+        return mTargetTerminalId;
+    }
+
+ private:
+    // Copy constructor is public to use this class as a type of return value.
+    DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry);
+    DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry);
+
+    const bool mHasNext;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+    const int mTargetTerminalId;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H */
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/dict_content.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/dict_content.h
new file mode 100644
index 000000000..d3b84fa04
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/dict_content.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ * + * This file was generated from + * dictionary/structure/v4/content/dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_DICT_CONTENT_H + +#include "defines.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class DictContent { + public: + virtual ~DictContent() {} + virtual bool isValid() const = 0; + + protected: + DictContent() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictContent); +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp new file mode 100644 index 000000000..b167f0ab2 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/content/probability_dict_content.cpp + */ + +#include "dictionary/structure/backward/v402/content/probability_dict_content.h" + +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + // This method can be called with invalid terminal id during GC. + return ProbabilityEntry(0 /* flags */, NOT_A_PROBABILITY); + } + const BufferWithExtendableBuffer *const buffer = getBuffer(); + int entryPos = getEntryPos(terminalId); + const int flags = buffer->readUintAndAdvancePosition( + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &entryPos); + const int probability = buffer->readUintAndAdvancePosition( + Ver4DictConstants::PROBABILITY_SIZE, &entryPos); + if (mHasHistoricalInfo) { + const int timestamp = buffer->readUintAndAdvancePosition( + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &entryPos); + const int level = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); + const int count = buffer->readUintAndAdvancePosition( + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); + // Hack for better migration. 
+ const HistoricalInfo historicalInfo(timestamp, level, count + level); + return ProbabilityEntry(flags, probability, &historicalInfo); + } else { + return ProbabilityEntry(flags, probability); + } +} + +bool ProbabilityDictContent::setProbabilityEntry(const int terminalId, + const ProbabilityEntry *const probabilityEntry) { + if (terminalId < 0) { + return false; + } + const int entryPos = getEntryPos(terminalId); + if (terminalId >= mSize) { + ProbabilityEntry dummyEntry; + // Write new entry. + int writingPos = getBuffer()->getTailPosition(); + while (writingPos <= entryPos) { + // Fulfilling with dummy entries until writingPos. + if (!writeEntry(&dummyEntry, writingPos)) { + AKLOGE("Cannot write dummy entry. pos: %d, mSize: %d", writingPos, mSize); + return false; + } + writingPos += getEntrySize(); + } + mSize = terminalId + 1; + } + return writeEntry(probabilityEntry, entryPos); +} + +bool ProbabilityDictContent::flushToFile(const char *const dictPath) const { + if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + ProbabilityDictContent probabilityDictContentToWrite(mHasHistoricalInfo); + for (int i = 0; i < mSize; ++i) { + const ProbabilityEntry probabilityEntry = getProbabilityEntry(i); + if (!probabilityDictContentToWrite.setProbabilityEntry(i, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in flushToFile. 
terminalId: %d", i); + return false; + } + } + return probabilityDictContentToWrite.flush(dictPath, + Ver4DictConstants::FREQ_FILE_EXTENSION); + } else { + return flush(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION); + } +} + +bool ProbabilityDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const ProbabilityEntry probabilityEntry = + originalProbabilityDictContent->getProbabilityEntry(it->first); + if (!setProbabilityEntry(it->second, &probabilityEntry)) { + AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second); + return false; + } + } + return true; +} + +int ProbabilityDictContent::getEntrySize() const { + if (mHasHistoricalInfo) { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE + + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE; + } else { + return Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE + + Ver4DictConstants::PROBABILITY_SIZE; + } +} + +int ProbabilityDictContent::getEntryPos(const int terminalId) const { + return terminalId * getEntrySize(); +} + +bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry, + const int entryPos) { + BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer(); + int writingPos = entryPos; + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(), + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { + AKLOGE("Cannot write flags in probability dict content. 
pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(), + Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { + AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos); + return false; + } + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo(); + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(), + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos); + return false; + } + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(), + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) { + AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos); + return false; + } + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h new file mode 100644 index 000000000..464b29f3f --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/probability_dict_content.h + */ + +#ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H +#define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/single_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class ProbabilityEntry; + +class ProbabilityDictContent : public SingleDictContent { + public: + ProbabilityDictContent(const char *const dictPath, const bool hasHistoricalInfo, + const bool isUpdatable) + : SingleDictContent(dictPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable), + mHasHistoricalInfo(hasHistoricalInfo), + mSize(getBuffer()->getTailPosition() / getEntrySize()) {} + + ProbabilityDictContent(const bool hasHistoricalInfo) + : mHasHistoricalInfo(hasHistoricalInfo), mSize(0) {} + + const ProbabilityEntry getProbabilityEntry(const int terminalId) const; + + bool setProbabilityEntry(const int terminalId, const ProbabilityEntry *const probabilityEntry); + + bool flushToFile(const char *const dictPath) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ProbabilityDictContent *const originalProbabilityDictContent); + + private: + 
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); + + int getEntrySize() const; + + int getEntryPos(const int terminalId) const; + + bool writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos); + + bool mHasHistoricalInfo; + int mSize; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_entry.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_entry.h new file mode 100644 index 000000000..94e36bf51 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/probability_entry.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/probability_entry.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H
+#define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Immutable value object bundling the flags, probability and historical info of one
+// probability entry. Every member is const, so an "update" is expressed by building a
+// modified copy through the createEntryWith*() helpers below.
+class ProbabilityEntry {
+ public:
+    // Copies every field of probabilityEntry.
+    ProbabilityEntry(const ProbabilityEntry &probabilityEntry)
+            : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability),
+              mHistoricalInfo(probabilityEntry.mHistoricalInfo) {}
+
+    // Dummy entry: zero flags, NOT_A_PROBABILITY and a default-constructed HistoricalInfo.
+    ProbabilityEntry()
+            : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {}
+
+    // Entry without historical information (HistoricalInfo is default-constructed).
+    ProbabilityEntry(const int flags, const int probability)
+            : mFlags(flags), mProbability(probability), mHistoricalInfo() {}
+
+    // Entry with historical information.
+    // historicalInfo is dereferenced unconditionally, so it must not be null.
+    ProbabilityEntry(const int flags, const int probability,
+            const HistoricalInfo *const historicalInfo)
+            : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {}
+
+    // Returns a copy of this entry in which only the probability is replaced.
+    const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const {
+        return ProbabilityEntry(mFlags, probability, &mHistoricalInfo);
+    }
+
+    // Returns a copy of this entry in which only the historical info is replaced.
+    // historicalInfo is forwarded to the constructor that dereferences it; must not be null.
+    const ProbabilityEntry createEntryWithUpdatedHistoricalInfo(
+            const HistoricalInfo *const historicalInfo) const {
+        return ProbabilityEntry(mFlags, mProbability, historicalInfo);
+    }
+
+    // True when the stored HistoricalInfo reports itself as valid.
+    bool hasHistoricalInfo() const {
+        return mHistoricalInfo.isValid();
+    }
+
+    int getFlags() const {
+        return mFlags;
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    // Returns the address of the member, so the pointer is only usable while this
+    // entry object is alive.
+    const HistoricalInfo *getHistoricalInfo() const {
+        return &mHistoricalInfo;
+    }
+
+ private:
+    // Copy constructor is public to use this class as a type of return value.
+    DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry);
+
+    const int mFlags;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H */
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
new file mode 100644
index 000000000..e538a02a1
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/shortcut_dict_content.cpp
+ */
+
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Reads the shortcut entry located at *shortcutEntryPos and advances *shortcutEntryPos.
+// outProbability and outhasNext are optional (may be null). The code points are only
+// read when both outCodePoint and outCodePointCount are non-null; otherwise the
+// position is advanced past the flags field only.
+// When the position is outside [0, tail position) the error is logged and asserted,
+// *outhasNext is set to false and *outCodePointCount to 0 (when non-null), and the
+// position is left unchanged.
+void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
+        int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
+        bool *const outhasNext, int *const shortcutEntryPos) const {
+    const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer();
+    if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) {
+        AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d",
+                *shortcutEntryPos, shortcutListBuffer->getTailPosition());
+        ASSERT(false);
+        // Report "end of list, empty entry" so that callers looping on hasNext stop.
+        if (outhasNext) {
+            *outhasNext = false;
+        }
+        if (outCodePointCount) {
+            *outCodePointCount = 0;
+        }
+        return;
+    }
+
+    // The flags field packs the probability and the has-next bit.
+    const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition(
+            Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+    if (outProbability) {
+        *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK;
+    }
+    if (outhasNext) {
+        *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
+    }
+    if (outCodePoint && outCodePointCount) {
+        shortcutListBuffer->readCodePointsAndAdvancePosition(
+                maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos);
+    }
+}
+
+// Returns the value stored for terminalId in the address lookup table, or
+// NOT_A_DICT_POS when the table does not contain terminalId.
+int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const {
+    const SparseTable *const addressLookupTable = getAddressLookupTable();
+    if (!addressLookupTable->contains(terminalId)) {
+        return NOT_A_DICT_POS;
+    }
+    return addressLookupTable->get(terminalId);
+}
+
+// Flushes the lookup table, the address table and the content using the shortcut file
+// extensions; returns the result of SparseTableDictContent::flush().
+bool ShortcutDictContent::flushToFile(const char *const dictPath) const {
+    return flush(dictPath, Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION,
+            Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION,
+            Ver4DictConstants::SHORTCUT_FILE_EXTENSION);
+}
+
+// For every pair in terminalIdMap, looks up the shortcut list that
+// originalShortcutDictContent holds for the key (it->first), copies it to the tail of
+// this content and registers the copy under the mapped id (it->second).
+// Ids without a shortcut list in the original content are skipped.
+// Returns false as soon as a copy or a lookup-table update fails.
+bool ShortcutDictContent::runGC(
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const ShortcutDictContent *const originalShortcutDictContent) {
+    for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+            it != terminalIdMap->end(); ++it) {
+        const int originalShortcutListPos =
+                originalShortcutDictContent->getShortcutListHeadPos(it->first);
+        if (originalShortcutListPos == NOT_A_DICT_POS) {
+            continue;
+        }
+        const int shortcutListPos = getContentBuffer()->getTailPosition();
+        // Copy shortcut list from original content.
+        if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent,
+                shortcutListPos)) {
+            AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d",
+                    originalShortcutListPos, shortcutListPos);
+            return false;
+        }
+        // Set shortcut list position to the lookup table.
+        if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) {
+            AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d",
+                    it->second, shortcutListPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Points the lookup-table entry of terminalId at the current tail of the content
+// buffer. Nothing is written to the content buffer here; the caller is expected to
+// write the list entries at that position afterwards.
+bool ShortcutDictContent::createNewShortcutList(const int terminalId) {
+    const int shortcutListListPos = getContentBuffer()->getTailPosition();
+    return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos);
+}
+
+// Copies the list starting at shortcutListPos inside this content to toPos.
+bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) {
+    return copyShortcutListFromDictContent(shortcutListPos, this, toPos);
+}
+
+// Reads entries from sourceShortcutDictContent starting at shortcutListPos until an
+// entry without the has-next flag has been read, writing each entry to this content
+// starting at toPos. Returns false when a write fails.
+bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos,
+        const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) {
+    bool hasNext = true;
+    int readingPos = shortcutListPos;
+    int writingPos = toPos;
+    int codePoints[MAX_WORD_LENGTH];
+    while (hasNext) {
+        int probability = 0;
+        int codePointCount = 0;
+        sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH,
+                codePoints, &codePointCount, &probability, &hasNext, &readingPos);
+        if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability,
+                hasNext, &writingPos)) {
+            AKLOGE("Cannot write shortcut entry to copy. pos: %d", writingPos);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Rewrites only the flags field of the entry at shortcutEntryPos: the probability is
+// replaced while the has-next bit read from the existing flags is preserved.
+bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) {
+    BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
+    const int shortcutFlags = shortcutListBuffer->readUint(
+            Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+    const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
+    const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext);
+    return shortcutListBuffer->writeUint(shortcutFlagsToWrite,
+            Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
+}
+
+// Writes one entry at *shortcutEntryPos: first the flags field built from probability
+// and hasNext, then the code points followed by a terminator. *shortcutEntryPos is
+// advanced by the buffer's write calls. Returns false (after logging) when either
+// write fails.
+bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint,
+        const int codePointCount, const int probability, const bool hasNext,
+        int *const shortcutEntryPos) {
+    BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
+    const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext);
+    if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags,
+            Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) {
+        AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos);
+        return false;
+    }
+    if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount,
+            true /* writesTerminator */, shortcutEntryPos)) {
+        AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos);
+        return false;
+    }
+    return true;
+}
+
+// Find a shortcut entry that has specified target and return its position.
+int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const { + bool hasNext = true; + int readingPos = shortcutListPos; + int targetCodePoints[MAX_WORD_LENGTH]; + while (hasNext) { + const int entryPos = readingPos; + int probability = 0; + int targetCodePointCount = 0; + getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, + &probability, &hasNext, &readingPos); + if (targetCodePointCount != codePointCount) { + continue; + } + bool matched = true; + for (int i = 0; i < codePointCount; ++i) { + if (targetCodePointsToFind[i] != targetCodePoints[i]) { + matched = false; + break; + } + } + if (matched) { + return entryPos; + } + } + return NOT_A_DICT_POS; +} + +int ShortcutDictContent::createAndGetShortcutFlags(const int probability, + const bool hasNext) const { + return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h new file mode 100644 index 000000000..3b725e896 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/shortcut_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Shortcut lists of a version-4.0.2 dictionary. A sparse address table maps a terminal
+// id to the head position of its shortcut list inside the content buffer.
+class ShortcutDictContent : public SparseTableDictContent {
+ public:
+    // Opens the shortcut lookup-table, address-table and content files under dictPath.
+    ShortcutDictContent(const char *const dictPath, const bool isUpdatable)
+            : SparseTableDictContent(dictPath,
+                      Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION,
+                      Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION,
+                      Ver4DictConstants::SHORTCUT_FILE_EXTENSION, isUpdatable,
+                      Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+                      Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
+
+    // Creates empty in-memory content that is not backed by any file.
+    ShortcutDictContent()
+            : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+                      Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
+
+    // Same as getShortcutEntryAndAdvancePosition() but takes the position by value, so
+    // the caller's position is not advanced.
+    void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, int *const outProbability, bool *const outhasNext,
+            const int shortcutEntryPos) {
+        int readingPos = shortcutEntryPos;
+        return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint,
+                outCodePointCount, outProbability, outhasNext, &readingPos);
+    }
+
+    // Reads the entry at *shortcutEntryPos and advances the position.
+    void getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
+            int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
+            bool *const outhasNext, int *const shortcutEntryPos) const;
+
+    // Returns head position of shortcut list for a PtNode specified by terminalId.
+    int getShortcutListHeadPos(const int terminalId) const;
+
+    bool flushToFile(const char *const dictPath) const;
+
+    // Rebuilds this content from originalShortcutDictContent using terminalIdMap to
+    // translate terminal ids.
+    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const ShortcutDictContent *const originalShortcutDictContent);
+
+    bool createNewShortcutList(const int terminalId);
+
+    bool copyShortcutList(const int shortcutListPos, const int toPos);
+
+    bool setProbability(const int probability, const int shortcutEntryPos);
+
+    // Same as writeShortcutEntryAndAdvancePosition() but takes the position by value,
+    // so the caller's position is not advanced.
+    bool writeShortcutEntry(const int *const codePoint, const int codePointCount,
+            const int probability, const bool hasNext, const int shortcutEntryPos) {
+        int writingPos = shortcutEntryPos;
+        return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability,
+                hasNext, &writingPos);
+    }
+
+    bool writeShortcutEntryAndAdvancePosition(const int *const codePoint,
+            const int codePointCount, const int probability, const bool hasNext,
+            int *const shortcutEntryPos);
+
+    // Returns the position of the entry in the list at shortcutListPos whose target
+    // equals targetCodePointsToFind.
+    int findShortcutEntryAndGetPos(const int shortcutListPos,
+            const int *const targetCodePointsToFind, const int codePointCount) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent);
+
+    bool copyShortcutListFromDictContent(const int shortcutListPos,
+            const ShortcutDictContent *const sourceShortcutDictContent, const int toPos);
+
+    int createAndGetShortcutFlags(const int probability, const bool hasNext) const;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H */
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h
new file mode 100644
index 000000000..89df2a1e0
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache
License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/single_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Dictionary content that lives in a single buffer: an optional memory-mapped file
+// wrapped by a BufferWithExtendableBuffer.
+class SingleDictContent : public DictContent {
+ public:
+    // Opens contentFileName under dictPath through MmappedBuffer::openBuffer().
+    // When that yields a null buffer, the content buffer wraps an empty view and
+    // isValid() reports false.
+    // NOTE: the initializers rely on mMmappedBuffer being declared (and therefore
+    // initialized) before mExpandableContentBuffer and mIsValid.
+    SingleDictContent(const char *const dictPath, const char *const contentFileName,
+            const bool isUpdatable)
+            : mMmappedBuffer(MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)),
+              mExpandableContentBuffer(
+                      mMmappedBuffer ? mMmappedBuffer->getReadWriteByteArrayView() :
+                              ReadWriteByteArrayView(),
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mIsValid(mMmappedBuffer) {}
+
+    // Creates empty in-memory content with no mapped file; it is always valid.
+    SingleDictContent()
+            : mMmappedBuffer(nullptr),
+              mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mIsValid(true) {}
+
+    virtual ~SingleDictContent() {}
+
+    virtual bool isValid() const {
+        return mIsValid;
+    }
+
+    // Forwards to the content buffer's isNearSizeLimit().
+    bool isNearSizeLimit() const {
+        return mExpandableContentBuffer.isNearSizeLimit();
+    }
+
+ protected:
+    BufferWithExtendableBuffer *getWritableBuffer() {
+        return &mExpandableContentBuffer;
+    }
+
+    const BufferWithExtendableBuffer *getBuffer() const {
+        return &mExpandableContentBuffer;
+    }
+
+    // Writes the content buffer out via
+    // DictFileWritingUtils::flushBufferToFileWithSuffix() and returns its result.
+    bool flush(const char *const dictPath, const char *const contentFileNameSuffix) const {
+        return DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath,
+                contentFileNameSuffix, &mExpandableContentBuffer);
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(SingleDictContent);
+
+    const MmappedBuffer::MmappedBufferPtr mMmappedBuffer;
+    BufferWithExtendableBuffer mExpandableContentBuffer;
+    const bool mIsValid;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H */
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
new file mode 100644
index 000000000..280f0f85a
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/content/sparse_table_dict_content.cpp + */ + +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" + +namespace latinime { +namespace backward { +namespace v402 { + +bool SparseTableDictContent::flush(const char *const dictPath, + const char *const lookupTableFileNameSuffix, const char *const addressTableFileNameSuffix, + const char *const contentFileNameSuffix) const { + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, lookupTableFileNameSuffix, + &mExpandableLookupTableBuffer)){ + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, addressTableFileNameSuffix, + &mExpandableAddressTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, contentFileNameSuffix, + &mExpandableContentBuffer)) { + return false; + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h new file mode 100644 index 000000000..4b5af87ad --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/content/sparse_table_dict_content.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H
+#define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H
+
+#include "defines.h"
+#include "dictionary/structure/backward/v402/content/dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "dictionary/utils/sparse_table.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Dictionary content made of three buffers: a lookup table and an address table, which
+// together back a SparseTable, plus the content buffer the table's values refer to.
+// TODO: Support multiple contents.
+class SparseTableDictContent : public DictContent {
+ public:
+    // Opens the three files under dictPath through MmappedBuffer::openBuffer(). Each
+    // extendable buffer wraps the mapped view, or an empty view when the corresponding
+    // open yielded a null buffer. isValid() is true only when all three were opened.
+    // NOTE: the initializers depend on the member declaration order (mapped buffers,
+    // then extendable buffers, then the sparse table that points at them).
+    AK_FORCE_INLINE SparseTableDictContent(const char *const dictPath,
+            const char *const lookupTableFileName, const char *const addressTableFileName,
+            const char *const contentFileName, const bool isUpdatable,
+            const int sparseTableBlockSize, const int sparseTableDataSize)
+            : mLookupTableBuffer(
+                      MmappedBuffer::openBuffer(dictPath, lookupTableFileName, isUpdatable)),
+              mAddressTableBuffer(
+                      MmappedBuffer::openBuffer(dictPath, addressTableFileName, isUpdatable)),
+              mContentBuffer(
+                      MmappedBuffer::openBuffer(dictPath, contentFileName, isUpdatable)),
+              mExpandableLookupTableBuffer(
+                      mLookupTableBuffer ? mLookupTableBuffer->getReadWriteByteArrayView() :
+                              ReadWriteByteArrayView(),
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mExpandableAddressTableBuffer(
+                      mAddressTableBuffer ? mAddressTableBuffer->getReadWriteByteArrayView() :
+                              ReadWriteByteArrayView(),
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mExpandableContentBuffer(
+                      mContentBuffer ? mContentBuffer->getReadWriteByteArrayView() :
+                              ReadWriteByteArrayView(),
+                      BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
+              mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+                      sparseTableBlockSize, sparseTableDataSize),
+              mIsValid(mLookupTableBuffer && mAddressTableBuffer && mContentBuffer) {}
+
+    // Creates empty in-memory content with no mapped files; it is always valid.
+    SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize)
+            : mLookupTableBuffer(), mAddressTableBuffer(), mContentBuffer(),
+              mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+              mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+              mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
+              mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
+                      sparseTableBlockSize, sparseTableDataSize), mIsValid(true) {}
+
+    virtual ~SparseTableDictContent() {}
+
+    virtual bool isValid() const {
+        return mIsValid;
+    }
+
+    // True when any of the three buffers reports being near its size limit.
+    bool isNearSizeLimit() const {
+        return mExpandableLookupTableBuffer.isNearSizeLimit()
+                || mExpandableAddressTableBuffer.isNearSizeLimit()
+                || mExpandableContentBuffer.isNearSizeLimit();
+    }
+
+ protected:
+    SparseTable *getUpdatableAddressLookupTable() {
+        return &mAddressLookupTable;
+    }
+
+    const SparseTable *getAddressLookupTable() const {
+        return &mAddressLookupTable;
+    }
+
+    BufferWithExtendableBuffer *getWritableContentBuffer() {
+        return &mExpandableContentBuffer;
+    }
+
+    const BufferWithExtendableBuffer *getContentBuffer() const {
+        return &mExpandableContentBuffer;
+    }
+
+    // Writes the lookup table, address table and content buffers to files.
+    bool flush(const char *const dictDirPath, const char *const lookupTableFileName,
+            const char *const addressTableFileName, const char *const contentFileName) const;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent);
+
+    const MmappedBuffer::MmappedBufferPtr mLookupTableBuffer;
+    const MmappedBuffer::MmappedBufferPtr mAddressTableBuffer;
+    const MmappedBuffer::MmappedBufferPtr mContentBuffer;
+    BufferWithExtendableBuffer mExpandableLookupTableBuffer;
+    BufferWithExtendableBuffer mExpandableAddressTableBuffer;
+    BufferWithExtendableBuffer mExpandableContentBuffer;
+    SparseTable mAddressLookupTable;
+    const bool mIsValid;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif /* LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H */
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
new file mode 100644
index 000000000..30b72bbd1
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ * + * This file was generated from + * dictionary/structure/v4/content/terminal_position_lookup_table.cpp + */ + +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" + +#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + return NOT_A_DICT_POS; + } + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); + return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ? + NOT_A_DICT_POS : terminalPos; +} + +bool TerminalPositionLookupTable::setTerminalPtNodePosition( + const int terminalId, const int terminalPtNodePos) { + if (terminalId < 0) { + return NOT_A_DICT_POS; + } + while (terminalId >= mSize) { + // Write new entry. + if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) { + return false; + } + mSize++; + } + const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ? + terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS; + return getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); +} + +bool TerminalPositionLookupTable::flushToFile(const char *const dictPath) const { + // If the used buffer size is smaller than the actual buffer size, regenerate the lookup + // table and write the new table to the file. 
+ if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + TerminalPositionLookupTable lookupTableToWrite; + for (int i = 0; i < mSize; ++i) { + const int terminalPtNodePosition = getTerminalPtNodePosition(i); + if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) { + AKLOGE("Cannot set terminal position to lookupTableToWrite." + " terminalId: %d, position: %d", i, terminalPtNodePosition); + return false; + } + } + return lookupTableToWrite.flush(dictPath, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } else { + // We can simply use this lookup table because the buffer size has not been + // changed. + return flush(dictPath, Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION); + } +} + +bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) { + int removedEntryCount = 0; + int nextNewTerminalId = 0; + for (int i = 0; i < mSize; ++i) { + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i)); + if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) { + // This entry is a garbage. + removedEntryCount++; + } else { + // Give a new terminal id to the entry. + if (!getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, + getEntryPos(nextNewTerminalId))) { + return false; + } + // Memorize the mapping to the old terminal id to the new terminal id. 
+ terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId)); + nextNewTerminalId++; + } + } + mSize = nextNewTerminalId; + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/app/src/main/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h new file mode 100644 index 000000000..641c7496f --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/content/terminal_position_lookup_table.h + */ + +#ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H +#define LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H + +#include + +#include "defines.h" +#include "dictionary/structure/backward/v402/content/single_dict_content.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class TerminalPositionLookupTable : public SingleDictContent { + public: + typedef std::unordered_map TerminalIdMap; + + TerminalPositionLookupTable(const char *const dictPath, const bool isUpdatable) + : SingleDictContent(dictPath, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION, isUpdatable), + mSize(getBuffer()->getTailPosition() + / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} + + TerminalPositionLookupTable() : mSize(0) {} + + int getTerminalPtNodePosition(const int terminalId) const; + + bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos); + + int getNextTerminalId() const { + return mSize; + } + + bool flushToFile(const char *const dictPath) const; + + bool runGCTerminalIds(TerminalIdMap *const terminalIdMap); + + private: + DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); + + int getEntryPos(const int terminalId) const { + return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + } + + int mSize; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif // LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/app/src/main/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h new file mode 100644 index 000000000..8cda8c5cf --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h @@ 
-0,0 +1,118 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ *   dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
+ */
+
+#ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
+#define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Shortcut policy for version-4.0.2 dictionaries; reads and updates shortcut lists
+// through a ShortcutDictContent that it does not own (a raw pointer is stored).
+class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
+ public:
+    // terminalPositionLookupTable is accepted but neither stored nor used by this class.
+    Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent,
+            const TerminalPositionLookupTable *const terminalPositionLookupTable)
+            : mShortcutDictContent(shortcutDictContent) {}
+
+    ~Ver4ShortcutListPolicy() {}
+
+    int getStartPos(const int pos) const {
+        // The first shortcut entry is located at the head position of the shortcut list.
+        return pos;
+    }
+
+    // Reads the entry at *pos and advances *pos. outIsWhitelist is optional; when given
+    // it receives ShortcutListReadingUtils::isWhitelist() of the entry's probability.
+    void getNextShortcut(const int maxCodePointCount, int *const outCodePoint,
+            int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
+            int *const pos) const {
+        int probability = 0;
+        mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount,
+                outCodePoint, outCodePointCount, &probability, outHasNext, pos);
+        if (outIsWhitelist) {
+            *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability);
+        }
+    }
+
+    void skipAllShortcuts(int *const pos) const {
+        // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries.
+    }
+
+    // Adds a shortcut target for terminalId, or updates it when the same target already
+    // exists. Three cases:
+    //  1. terminalId has no list yet: a new list with this single entry is created.
+    //  2. The list exists but lacks the target: a new list is started at the buffer
+    //     tail with the new entry first (hasNext = true), and the old list is copied
+    //     right after it.
+    //  3. The target exists: the entry is overwritten in place, keeping its has-next flag.
+    // Returns false when any step fails.
+    bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount,
+            const int probability) {
+        const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+        if (shortcutListPos == NOT_A_DICT_POS) {
+            // Create shortcut list.
+            if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+                AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+                return false;
+            }
+            const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+            return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability,
+                    false /* hasNext */, writingPos);
+        }
+        const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos,
+                codePoints, codePointCount);
+        if (entryPos == NOT_A_DICT_POS) {
+            // Add new entry to the shortcut list.
+            // Create new shortcut list.
+            if (!mShortcutDictContent->createNewShortcutList(terminalId)) {
+                AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId);
+                return false;
+            }
+            int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId);
+            if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints,
+                    codePointCount, probability, true /* hasNext */, &writingPos)) {
+                AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId,
+                        writingPos);
+                return false;
+            }
+            return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos);
+        }
+        // Overwrite existing entry.
+        // Only the has-next flag of the existing entry is needed, so the other out
+        // pointers are passed as null (0).
+        bool hasNext = false;
+        mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */,
+                0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos);
+        if (!mShortcutDictContent->writeShortcutEntry(codePoints,
+                codePointCount, probability, hasNext, entryPos)) {
+            AKLOGE("Cannot overwrite shortcut entry. terminal id: %d, pos: %d", terminalId,
+                    entryPos);
+            return false;
+        }
+        return true;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy);
+
+    ShortcutDictContent *const mShortcutDictContent;
+};
+} // namespace v402
+} // namespace backward
+} // namespace latinime
+#endif // LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
new file mode 100644
index 000000000..4a9704f4d
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT EDIT THIS FILE !!!!!
+ * + * This file was generated from + * dictionary/structure/v4/ver4_dict_buffers.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" + +#include +#include +#include +#include + +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/file_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { +namespace backward { +namespace v402 { + +/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( + const char *const dictPath, MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion) { + if (!headerBuffer) { + ASSERT(false); + AKLOGE("The header buffer must be valid to open ver4 dict buffers."); + return Ver4DictBuffersPtr(nullptr); + } + // TODO: take only dictDirPath, and open both header and trie files in the constructor below + const bool isUpdatable = headerBuffer->isUpdatable(); + return Ver4DictBuffersPtr(new Ver4DictBuffers(dictPath, std::move(headerBuffer), isUpdatable, + formatVersion)); +} + +bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const { + // Create temporary directory. + const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); + char tmpDirPath[tmpDirPathBufSize]; + FileUtils::getFilePathWithSuffix(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize, + tmpDirPath); + if (FileUtils::existsDir(tmpDirPath)) { + if (!FileUtils::removeDirAndFiles(tmpDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", tmpDirPath); + ASSERT(false); + return false; + } + } + umask(S_IWGRP | S_IWOTH); + if (mkdir(tmpDirPath, S_IRWXU) == -1) { + AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno); + return false; + } + // Get dictionary base path. 
+ const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; + char dictName[dictNameBufSize]; + FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); + const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName); + char dictPath[dictPathBufSize]; + FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath); + + // Write header file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) { + AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::HEADER_FILE_EXTENSION); + return false; + } + // Write trie file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::TRIE_FILE_EXTENSION, &mExpandableTrieBuffer)) { + AKLOGE("Dictionary trie file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::TRIE_FILE_EXTENSION); + return false; + } + // Write dictionary contents. + if (!mTerminalPositionLookupTable.flushToFile(dictPath)) { + AKLOGE("Terminal position lookup table cannot be written. %s", tmpDirPath); + return false; + } + if (!mProbabilityDictContent.flushToFile(dictPath)) { + AKLOGE("Probability dict content cannot be written. %s", tmpDirPath); + return false; + } + if (!mBigramDictContent.flushToFile(dictPath)) { + AKLOGE("Bigram dict content cannot be written. %s", tmpDirPath); + return false; + } + if (!mShortcutDictContent.flushToFile(dictPath)) { + AKLOGE("Shortcut dict content cannot be written. %s", tmpDirPath); + return false; + } + // Remove existing dictionary. + if (!FileUtils::removeDirAndFiles(dictDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", dictDirPath); + ASSERT(false); + return false; + } + // Rename temporary directory. 
+ if (rename(tmpDirPath, dictDirPath) != 0) { + AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath); + ASSERT(false); + return false; + } + return true; +} + +Ver4DictBuffers::Ver4DictBuffers(const char *const dictPath, + MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion) + : mHeaderBuffer(std::move(headerBuffer)), + mDictBuffer(MmappedBuffer::openBuffer(dictPath, + Ver4DictConstants::TRIE_FILE_EXTENSION, isUpdatable)), + mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion), + mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableTrieBuffer( + mDictBuffer ? mDictBuffer->getReadWriteByteArrayView() : + ReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mTerminalPositionLookupTable(dictPath, isUpdatable), + mProbabilityDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), + mBigramDictContent(dictPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), + mShortcutDictContent(dictPath, isUpdatable), + mIsUpdatable(isUpdatable) {} + +Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize) + : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy), + mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(), + mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()), + mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(), + mIsUpdatable(true) {} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h new file mode 100644 index 000000000..0d09fee9a 
--- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/ver4_dict_buffers.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H +#define LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H + +#include + +#include "defines.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/mmapped_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +class Ver4DictBuffers { + public: + typedef std::unique_ptr Ver4DictBuffersPtr; + + static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, + MmappedBuffer::MmappedBufferPtr headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion); + + static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( + const HeaderPolicy *const 
headerPolicy, const int maxTrieSize) { + return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize)); + } + + AK_FORCE_INLINE bool isValid() const { + return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid() + && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid() + && mBigramDictContent.isValid() && mShortcutDictContent.isValid(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mExpandableTrieBuffer.isNearSizeLimit() + || mTerminalPositionLookupTable.isNearSizeLimit() + || mProbabilityDictContent.isNearSizeLimit() + || mBigramDictContent.isNearSizeLimit() + || mShortcutDictContent.isNearSizeLimit(); + } + + AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const { + return &mHeaderPolicy; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() { + return &mExpandableHeaderBuffer; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE ProbabilityDictContent *getMutableProbabilityDictContent() { + return &mProbabilityDictContent; + } + + AK_FORCE_INLINE const ProbabilityDictContent *getProbabilityDictContent() const { + return &mProbabilityDictContent; + } + + AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() { + return &mBigramDictContent; + } + + AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const { + return &mBigramDictContent; + } + + AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE const ShortcutDictContent 
*getShortcutDictContent() const { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + bool flush(const char *const dictDirPath) const { + return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer); + } + + bool flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); + + Ver4DictBuffers(const char *const dictDirPath, + const MmappedBuffer::MmappedBufferPtr headerBuffer, const bool isUpdatable, + const FormatUtils::FORMAT_VERSION formatVersion); + + Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); + + const MmappedBuffer::MmappedBufferPtr mHeaderBuffer; + const MmappedBuffer::MmappedBufferPtr mDictBuffer; + const HeaderPolicy mHeaderPolicy; + BufferWithExtendableBuffer mExpandableHeaderBuffer; + BufferWithExtendableBuffer mExpandableTrieBuffer; + TerminalPositionLookupTable mTerminalPositionLookupTable; + ProbabilityDictContent mProbabilityDictContent; + BigramDictContent mBigramDictContent; + ShortcutDictContent mShortcutDictContent; + const int mIsUpdatable; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp new file mode 100644 index 000000000..2948d0716 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
// These values MUST match the definitions in FormatSpec.java.
// Suffixes of the files that make up one dictionary.
const char *const Ver4DictConstants::TRIE_FILE_EXTENSION = ".trie";
const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header";
const char *const Ver4DictConstants::FREQ_FILE_EXTENSION = ".freq";
// tat = Terminal Address Table
const char *const Ver4DictConstants::TERMINAL_ADDRESS_TABLE_FILE_EXTENSION = ".tat";
const char *const Ver4DictConstants::BIGRAM_FILE_EXTENSION = ".bigram_freq";
const char *const Ver4DictConstants::BIGRAM_LOOKUP_TABLE_FILE_EXTENSION = ".bigram_lookup";
const char *const Ver4DictConstants::BIGRAM_CONTENT_TABLE_FILE_EXTENSION = ".bigram_index_freq";
const char *const Ver4DictConstants::SHORTCUT_FILE_EXTENSION = ".shortcut_shortcut";
const char *const Ver4DictConstants::SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION = ".shortcut_lookup";
const char *const Ver4DictConstants::SHORTCUT_CONTENT_TABLE_FILE_EXTENSION =
        ".shortcut_index_shortcut";

// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets.
const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024;
// The extended region (the region of the dict file that has not been GCed plus the additional
// buffer) is limited to 1MB to prevent inefficient traversal.
const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024;

// Sentinel values and field sizes.
// NOTE(review): the sizes look like byte counts (the bigram target id size below is multiplied
// by 8 to get a bit width) -- confirm against FormatSpec.java.
const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
const int Ver4DictConstants::PROBABILITY_SIZE = 1;
const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;

// Block and data sizes of the bigram and shortcut address tables.
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;

const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3;
// The maximum unsigned value that fits in BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE bytes is used to
// represent an invalid terminal ID in bigram lists. With a field size of 3 this is 0xFFFFFF.
// This definition uses the value defined just above, so keep it after that line.
const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID =
        (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1;
const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1;
const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F;
const int Ver4DictConstants::BIGRAM_HAS_NEXT_MASK = 0x80;
const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1;

// Shortcut flags use the same mask values as the bigram flags above.
const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;
const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F;
const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80;
// TODO: Create PtConstants under the pt_common and move some constant values there.
// Note that there are corresponding definitions in FormatSpec.java.
// Holder for the constants of the backward-compatible (v402) version 4 dictionary format. Only
// static members are declared here; the values are given in the accompanying .cpp file.
class Ver4DictConstants {
 public:
    // File name suffixes of the individual dictionary files.
    static const char *const TRIE_FILE_EXTENSION;
    static const char *const HEADER_FILE_EXTENSION;
    static const char *const FREQ_FILE_EXTENSION;
    static const char *const TERMINAL_ADDRESS_TABLE_FILE_EXTENSION;
    static const char *const BIGRAM_FILE_EXTENSION;
    static const char *const BIGRAM_LOOKUP_TABLE_FILE_EXTENSION;
    static const char *const BIGRAM_CONTENT_TABLE_FILE_EXTENSION;
    static const char *const SHORTCUT_FILE_EXTENSION;
    static const char *const SHORTCUT_LOOKUP_TABLE_FILE_EXTENSION;
    static const char *const SHORTCUT_CONTENT_TABLE_FILE_EXTENSION;

    // Size limits.
    static const int MAX_DICTIONARY_SIZE;
    static const int MAX_DICT_EXTENDED_REGION_SIZE;

    // Sentinel values and field sizes.
    static const int NOT_A_TERMINAL_ID;
    static const int PROBABILITY_SIZE;
    static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
    static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
    static const int NOT_A_TERMINAL_ADDRESS;
    static const int TERMINAL_ID_FIELD_SIZE;
    static const int TIME_STAMP_FIELD_SIZE;
    static const int WORD_LEVEL_FIELD_SIZE;
    static const int WORD_COUNT_FIELD_SIZE;

    // Address table layout for bigram and shortcut contents.
    static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
    static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE;
    static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
    static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;

    // Bigram entry layout.
    static const int BIGRAM_FLAGS_FIELD_SIZE;
    static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
    static const int INVALID_BIGRAM_TARGET_TERMINAL_ID;
    static const int BIGRAM_PROBABILITY_MASK;
    static const int BIGRAM_HAS_NEXT_MASK;
    // Used when bigram list has time stamp.
    static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE;

    // Shortcut entry layout.
    static const int SHORTCUT_FLAGS_FIELD_SIZE;
    static const int SHORTCUT_PROBABILITY_MASK;
    static const int SHORTCUT_HAS_NEXT_MASK;

 private:
    // Constants holder only.
    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants);
};
/*
 * Reads the PtNode located at ptNodePos and returns its parameters. When the node is flagged as
 * moved, the node at its destination is read instead (recursively) and returned.
 *
 * ptNodePos: position of the PtNode in the trie buffer. A position outside
 *         [0, mBuffer->getTailPosition()) is logged and yields a default-constructed PtNodeParams.
 * siblingNodePos: sibling position to report, or NOT_A_DICT_POS to use the position just after
 *         the node that is read here.
 *
 * NOTE(review): the recursion for moved nodes has no visible guard against a cycle of moved
 * nodes in a broken dictionary.
 */
const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode(
        const int ptNodePos, const int siblingNodePos) const {
    if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) {
        // Reading invalid position because of bug or broken dictionary.
        AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d",
                ptNodePos, mBuffer->getTailPosition());
        ASSERT(false);
        return PtNodeParams();
    }
    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos);
    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
    // pos is the read cursor relative to dictBuf; headPos keeps the original position.
    int pos = ptNodePos;
    const int headPos = ptNodePos;
    if (usesAdditionalBuffer) {
        // Make the cursor relative to the additional buffer.
        pos -= mBuffer->getOriginalBufferSize();
    }
    // Fields are read in order: flags, parent offset, code points, [terminal id], children
    // position. Each read advances pos.
    const PatriciaTrieReadingUtils::NodeFlags flags =
            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
    const int parentPosOffset =
            DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(
                    dictBuf, &pos);
    const int parentPos =
            DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
    int codePoints[MAX_WORD_LENGTH];
    const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
            dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
    int terminalIdFieldPos = NOT_A_DICT_POS;
    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
    int probability = NOT_A_PROBABILITY;
    if (PatriciaTrieReadingUtils::isTerminal(flags)) {
        // Field positions handed back to the caller are converted back to positions that
        // include the original buffer size.
        terminalIdFieldPos = pos;
        if (usesAdditionalBuffer) {
            terminalIdFieldPos += mBuffer->getOriginalBufferSize();
        }
        terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
        // The probability is not read from the trie buffer; it is looked up by terminal id.
        const ProbabilityEntry probabilityEntry =
                mProbabilityDictContent->getProbabilityEntry(terminalId);
        if (probabilityEntry.hasHistoricalInfo()) {
            probability = ForgettingCurveUtils::decodeProbability(
                    probabilityEntry.getHistoricalInfo(), mHeaderPolicy);
        } else {
            probability = probabilityEntry.getProbability();
        }
    }
    int childrenPosFieldPos = pos;
    if (usesAdditionalBuffer) {
        childrenPosFieldPos += mBuffer->getOriginalBufferSize();
    }
    int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition(
            dictBuf, &pos);
    // NOT_A_DICT_POS is a sentinel and must not be shifted.
    if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) {
        childrenPos += mBuffer->getOriginalBufferSize();
    }
    if (usesAdditionalBuffer) {
        // Undo the adjustment made at the top so pos is comparable with ptNodePos again.
        pos += mBuffer->getOriginalBufferSize();
    }
    // Sibling position is the tail position of original PtNode.
    int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? pos : siblingNodePos;
    // Read destination node if the read node is a moved node.
    if (DynamicPtReadingUtils::isMoved(flags)) {
        // The destination position is stored at the same place as the parent position.
        return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
    } else {
        return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
                terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
                newSiblingNodePos);
    }
}
/*
 * This class helps to read the nodes of a ver4 patricia trie. It handles moved nodes and reads
 * node attributes, including the probability, which is obtained through the
 * ProbabilityDictContent given to the constructor.
 */
class Ver4PatriciaTrieNodeReader : public PtNodeReader {
 public:
    // Stores the three pointers as given; none of them is owned by this class.
    Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer,
            const ProbabilityDictContent *const probabilityDictContent,
            const HeaderPolicy *const headerPolicy)
            : mBuffer(buffer), mProbabilityDictContent(probabilityDictContent),
              mHeaderPolicy(headerPolicy) {}

    ~Ver4PatriciaTrieNodeReader() {}

    // Reads the PtNode at ptNodePos. No sibling position is supplied, so the helper derives it
    // from the node it reads.
    virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const {
        return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos,
                NOT_A_DICT_POS /* siblingNodePos */);
    }

 private:
    DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader);

    const BufferWithExtendableBuffer *const mBuffer;
    const ProbabilityDictContent *const mProbabilityDictContent;
    const HeaderPolicy *const mHeaderPolicy;

    // Defined out of line.
    const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
            const int siblingNodePos) const;
};
--git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp new file mode 100644 index 000000000..e3ab5ec20 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp @@ -0,0 +1,442 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" + +#include "dictionary/header/header_policy.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. 
+ if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->isTerminal()) { + // The PtNode is a terminal. Delete entry from the terminal position lookup table. + return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); + } else { + return true; + } +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, + false /* isDeleted */, false /* willBecomeNonTerminal */); + int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + // Update flags. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos)) { + return false; + } + // Update moved position, which is stored in the parent offset field. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + if (toBeUpdatedPtNodeParams->hasChildren()) { + // Update children's parent position. 
+ mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); + while (!mReadingHelper.isEnd()) { + const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); + int parentOffsetFieldPos = childPtNodeParams.getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), + &parentOffsetFieldPos)) { + // Parent offset cannot be written because of a bug or a broken dictionary; thus, + // we give up to update dictionary. + return false; + } + mReadingHelper.readNextSiblingNode(childPtNodeParams); + } + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) { + int pos = toBeUpdatedPtNodeParams->getHeadPos(); + const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mTrieBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + false /* isDeleted */, true /* willBecomeNonTerminal */); + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { + AKLOGE("Cannot update terminal position lookup table. terminal id: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + // Update flags. 
+ int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); + return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, + &writingPos); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) { + // Update probability and historical information. + // TODO: Update other information in the unigram property. + if (!toBeUpdatedPtNodeParams->isTerminal()) { + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, + unigramProperty); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { + if (!toBeUpdatedPtNodeParams->isTerminal()) { + AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); + return false; + } + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + if (originalProbabilityEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); + const ProbabilityEntry probabilityEntry = + originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); + if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { + AKLOGE("Cannot write updated probability entry. 
terminalId: %d", + toBeUpdatedPtNodeParams->getTerminalId()); + return false; + } + const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); + if (!isValid) { + if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); + return false; + } + } + *outNeedsToKeepPtNode = isValid; + } else { + // No need to update probability. + *outNeedsToKeepPtNode = true; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( + const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { + int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); + return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + newChildrenPosition, &childrenPosFieldPos); +} + +bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newTerminalId) { + return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, + toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { + return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, + ptNodeWritingPos); +} + + +bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( + const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, + int *const ptNodeWritingPos) { + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, + ptNodeWritingPos)) { + return false; + } + // Write probability. 
+ ProbabilityEntry newProbabilityEntry; + const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( + &newProbabilityEntry, unigramProperty); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId, + &probabilityEntryToWrite); +} + +bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { + if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) { + AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d", + prevWordIds[0], wordId); + return false; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(prevWordIds[0]); + const PtNodeParams sourcePtNodeParams = + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (!sourcePtNodeParams.hasBigrams()) { + // Update has bigrams flag. + return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(), + sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(), + sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(), + true /* hasBigrams */, + sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds, + const int wordId) { + return mBigramPolicy->removeEntry(prevWordIds[0], wordId); +} + +bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { + return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries( + sourcePtNodeParams->getTerminalId(), outBigramEntryCount); +} + +bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( + const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) { + int parentPos = 
toBeUpdatedPtNodeParams->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = toBeUpdatedPtNodeParams->getHeadPos() + + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. + int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = + dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { + return false; + } + + // Counts bigram entries. + if (outBigramEntryCount) { + *outBigramEntryCount = mBigramPolicy->getBigramEntryConut( + toBeUpdatedPtNodeParams->getTerminalId()); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) { + if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), + targetCodePoints, targetCodePointCount, shortcutProbability)) { + AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId()); + return false; + } + if (!ptNodeParams->hasShortcutTargets()) { + // Update has shortcut targets flag. 
+ return updatePtNodeFlags(ptNodeParams->getHeadPos(), + ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(), + ptNodeParams->isTerminal(), true /* hasShortcutTargets */, + ptNodeParams->hasBigrams(), + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags( + const PtNodeParams *const ptNodeParams) { + const bool hasBigrams = mBuffers->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; + const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; + return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(), + ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets, + hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos) { + const int nodePos = *ptNodeWritingPos; + // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the + // PtNode writing. + if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, + 0 /* nodeFlags */, ptNodeWritingPos)) { + return false; + } + // Calculate a parent offset and write the offset. 
+ if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, + ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { + return false; + } + // Write code points + if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, + ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { + return false; + } + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (!ptNodeParams->willBecomeNonTerminal()) { + if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { + terminalId = ptNodeParams->getTerminalId(); + } else if (ptNodeParams->isTerminal()) { + // Write terminal information using a new terminal id. + // Get a new unused terminal id. + terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); + } + } + const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; + if (isTerminal) { + // Update the lookup table. + if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( + terminalId, nodePos)) { + return false; + } + // Write terminal Id. + if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, + Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { + return false; + } + if (outTerminalId) { + *outTerminalId = terminalId; + } + } + // Write children position + if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, + ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { + return false; + } + return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(), + ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(), + ptNodeParams->hasBigrams(), + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); +} + +const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const { + // TODO: Consolidate historical info and probability. 
+ if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + const HistoricalInfo &historicalInfoForUpdate = unigramProperty->getHistoricalInfo(); + const HistoricalInfo updatedHistoricalInfo = + ForgettingCurveUtils::createUpdatedHistoricalInfo( + originalProbabilityEntry->getHistoricalInfo(), + unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); + return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( + &updatedHistoricalInfo); + } else { + return originalProbabilityEntry->createEntryWithUpdatedProbability( + unigramProperty->getProbability()); + } +} + +bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, + const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, + const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars) { + // Create node flags and write them. + PatriciaTrieReadingUtils::NodeFlags nodeFlags = + PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, + hasShortcutTargets, hasBigrams, hasMultipleChars, + CHILDREN_POSITION_FIELD_SIZE); + if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { + AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); + return false; + } + return true; +} + +bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) { + if (!mHeaderPolicy->hasHistoricalInfoOfWords()) { + // Require historical info to suppress unigram entry. 
+ return false; + } + const HistoricalInfo suppressedHistorycalInfo(0 /* timestamp */, 0 /* level */, 0 /* count */); + const ProbabilityEntry probabilityEntryToWrite = + ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&suppressedHistorycalInfo); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + ptNodeParams->getTerminalId(), &probabilityEntryToWrite); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h new file mode 100644 index 000000000..db3cea174 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_node_writer.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "utils/int_array_view.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class Ver4BigramListPolicy; +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PtNodeArrayReader; +class Ver4ShortcutListPolicy; + +/* + * This class is used for helping to writes nodes of ver4 patricia trie. 
+ */ +class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { + public: + Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, + Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy, + const PtNodeReader *const ptNodeReader, + const PtNodeArrayReader *const ptNodeArrayReader, + Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) + : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy), + mPtNodeReader(ptNodeReader), mReadingHelper(ptNodeReader, ptNodeArrayReader), + mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} + + virtual ~Ver4PatriciaTrieNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos); + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams); + + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty); + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode); + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition); + + bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newTerminalId); + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos); + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); + + virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); + 
+ virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId); + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount); + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount); + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability); + + bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); + + // Suppress unigram not to use the word for generating suggestions. So, this method can be used + // only for dictionaries with historical info. Also, suppressed entries are included in unigram + // count. They will be removed from the dictionary during GC. + bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); + + bool writePtNodeAndGetTerminalIdAndAdvancePosition( + const PtNodeParams *const ptNodeParams, int *const outTerminalId, + int *const ptNodeWritingPos); + + // Create updated probability entry using given unigram property. In addition to the + // probability, this method updates historical information if needed. + // TODO: Update flags belonging to the unigram property. 
+ const ProbabilityEntry createUpdatedEntryFrom( + const ProbabilityEntry *const originalProbabilityEntry, + const UnigramProperty *const unigramProperty) const; + + bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, + const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, + const bool hasMultipleChars); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mTrieBuffer; + Ver4DictBuffers *const mBuffers; + const HeaderPolicy *const mHeaderPolicy; + const PtNodeReader *const mPtNodeReader; + DynamicPtReadingHelper mReadingHelper; + Ver4BigramListPolicy *const mBigramPolicy; + Ver4ShortcutListPolicy *const mShortcutPolicy; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp new file mode 100644 index 000000000..6fb9cffb7 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -0,0 +1,662 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. 
+ * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_policy.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" + +#include + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/word_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. 
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; +const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + if (isTerminal && mHeaderPolicy->isDecayingDict()) { + // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose + // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a + // valid terminal DicNode. + isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; + } + readingHelper.readNextSiblingNode(ptNodeParams); + if (ptNodeParams.representsNonWordInfo()) { + // Skip PtNodes that represent non-word information. + continue; + } + const int wordId = isTerminal ? 
ptNodeParams.getHeadPos() : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), + wordId, ptNodeParams.getCodePointArrayView()); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( + maxCodePointCount, outCodePoints); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getWordId()."); + } + return getWordIdFromTerminalPtNodePos(ptNodePos); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (multiBigramMap) { + const int probability = 
multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); + } + if (!prevWordIds.empty()) { + const int probability = getProbabilityOfWord(prevWordIds, wordId); + if (probability != NOT_A_PROBABILITY) { + return getWordAttributes(probability, ptNodeParams); + } + } + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const { + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.getProbability() == 0); +} + +int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + // In the v4 format, bigramProbability is a conditional probability. + const int bigramConditionalProbability = bigramProbability; + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } + if (bigramConditionalProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } + return bigramConditionalProbability; +} + +int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID) { + return NOT_A_PROBABILITY; + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { + return NOT_A_PROBABILITY; + } + if (prevWordIds.empty()) { + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + } + if (prevWordIds[0] == NOT_A_WORD_ID) { + return NOT_A_PROBABILITY; + } + const PtNodeParams prevWordPtNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); + if 
(prevWordPtNodeParams.isDeleted()) { + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + } + const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( + prevWordPtNodeParams.getTerminalId()); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == ptNodePos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { + const int bigramConditionalProbability = getBigramConditionalProbability( + prevWordPtNodeParams.getProbability(), + prevWordPtNodeParams.representsBeginningOfSentence(), + bigramsIt.getProbability()); + return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability); + } + } + return NOT_A_PROBABILITY; +} + +/* Reports every bigram entry stored for prevWordIds[0] to listener as (conditional probability, target word id). Returns without visiting anything when there is no previous word or its PtNode is deleted. */ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const { + if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) { + return; + } + const PtNodeParams prevWordPtNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); + if (prevWordPtNodeParams.isDeleted()) { + return; + } + const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( + prevWordPtNodeParams.getTerminalId()); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + const int bigramConditionalProbability = getBigramConditionalProbability( + prevWordPtNodeParams.getProbability(), + prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability()); + listener->onVisitEntry(bigramConditionalProbability, + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); + } +} + +/* Turns a stored bigram probability into the conditional probability used by callers. With historical info in the header: unchanged in beginning-of-sentence context, otherwise min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability, MAX_PROBABILITY). Without historical info the stored value is returned as is. */ int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability, + const bool isInBeginningOfSentenceContext, const int bigramProbability) const { + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { 
+ if (isInBeginningOfSentenceContext) { + return bigramProbability; + } + // Calculate conditional probability. + return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability, + MAX_PROBABILITY); + } else { + // bigramProbability is a conditional probability. + return bigramProbability; + } +} + +/* Builds a shortcut iterator over the shortcut list of the word identified by wordId. */ BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( + const int wordId) const { + const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); + return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); +} + +/* Returns the shortcut list head position for the terminal id of the PtNode at ptNodePos, or NOT_A_DICT_POS when ptNodePos is NOT_A_DICT_POS or the PtNode is deleted. */ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +/* Returns the bigram list head position for the terminal id of the PtNode at ptNodePos, or NOT_A_DICT_POS when ptNodePos is NOT_A_DICT_POS or the PtNode is deleted. */ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getBigramDictContent()->getBigramListHeadPos( + ptNodeParams.getTerminalId()); +} + +/* Adds the word with the given unigram property and then its shortcut targets, if any. Returns false when the dictionary is not updatable, the trie buffer tail has reached MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS, the word or a shortcut target is longer than MAX_WORD_LENGTH, or any insertion step fails. */ bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %zd", + wordCodePoints.size()); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", + shortcut.getTargetCodePoints()->size()); + return false; + } + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = wordCodePoints.size(); + memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, + &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { /* Beginning-of-sentence entries are not counted as unigrams. */ + mEntryCounters.incrementNgramCount(NgramType::Unigram); + } + if (unigramProperty->getShortcuts().size() > 0) { + // Add shortcut target. 
+ const int wordPos = getTerminalPtNodePosFromWordId( + getWordId(codePointArrayView, false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { + AKLOGE("Cannot find terminal PtNode position to add shortcut target."); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (!mUpdatingHelper.addShortcutTarget(wordPos, + CodePointArrayView(*shortcut.getTargetCodePoints()), + shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " + "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), + shortcut.getProbability()); + return false; + } + } + } + return true; + } else { + return false; + } +} + +/* Looks the word up with exact case and hands its PtNode params to mNodeWriter.suppressUnigramEntry(). Returns false for a non-updatable dictionary or when the word is not found. */ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int ptNodePos = getTerminalPtNodePosFromWordId( + getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); + if (ptNodePos == NOT_A_DICT_POS) { + return false; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + return mNodeWriter.suppressUnigramEntry(&ptNodeParams); +} + +/* Adds a bigram from the first previous word of the n-gram context to the target word, which must already be in the dictionary. A missing previous word is accepted only when it is the beginning-of-sentence marker; a unigram entry for it is then added first. The bigram count is incremented when a new entry was created. */ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + const NgramContext *const ngramContext = ngramProperty->getNgramContext(); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); + return false; + } + if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. " + "length: %zd", ngramProperty->getTargetCodePoints()->size()); + return false; + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (prevWordIds.empty()) { + return false; + } + if (prevWordIds[0] == NOT_A_WORD_ID) { + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo()); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh word ids. 
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } else { + return false; + } + } + const int wordPos = getTerminalPtNodePosFromWordId(getWordId( + CodePointArrayView(*ngramProperty->getTargetCodePoints()), + false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { + return false; + } + bool addedNewBigram = false; + const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); + if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), + wordPos, ngramProperty, &addedNewBigram)) { + if (addedNewBigram) { + mEntryCounters.incrementNgramCount(NgramType::Bigram); + } + return true; + } else { + return false; + } +} + +/* Removes the bigram from the first previous word of ngramContext to wordCodePoints and decrements the bigram count on success. Returns false when the dictionary is not updatable or too large, the context is invalid, or either word is not found. */ bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. 
length: %zd", + wordCodePoints.size()); /* NOTE(review): this branch only logs; execution continues with the lookups below. Confirm that a return is not intended here. */ + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { + return false; + } + const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints, + false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { + return false; + } + const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); + if (mUpdatingHelper.removeNgramEntry( + PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { + mEntryCounters.decrementNgramCount(NgramType::Bigram); + return true; + } else { + return false; + } +} + + +/* Adds or updates the unigram for wordCodePoints and then the bigram from ngramContext to it. Valid words get DUMMY_PROBABILITY_FOR_VALID_WORDS, others NOT_A_PROBABILITY; the n-gram uses NOT_A_PROBABILITY when the previous word is the beginning of sentence. */ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( + const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, + const bool isValidWord, const HistoricalInfo historicalInfo) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY; + const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo); + if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { + AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) + ? 
NOT_A_PROBABILITY : probability; + const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram, + historicalInfo); + if (!addNgramEntry(&ngramProperty)) { /* NOTE(review): the message below repeats the unigram wording although addNgramEntry() is what failed. */ + AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + return true; +} + +/* Writes the dictionary to filePath with the current entry counts via mWritingHelper.writeToDictFile(). Sets mIsCorrupted when writing fails; returns false for a non-updatable dictionary. */ bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); + return false; + } + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { + AKLOGE("Cannot flush the dictionary to file."); + mIsCorrupted = true; + return false; + } + return true; +} + +/* Writes the dictionary to filePath via mWritingHelper.writeToDictFileWithGC(), starting from the root position. Sets mIsCorrupted when writing fails; returns false for a non-updatable dictionary. */ bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { + AKLOGE("Cannot flush the dictionary to file with GC."); + mIsCorrupted = true; + return false; + } + return true; +} + +/* Tells whether GC should run: the buffers are near their size limit, the extended region would exceed MAX_DICT_EXTENDED_REGION_SIZE, the trie has reached the dynamic-operation size limit while additional buffer is in use, or, for decaying dictionaries, ForgettingCurveUtils::needsToDecay() says so. Always false for a non-updatable dictionary. */ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBuffers->isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() + > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size of the trie exceeds the limit. + return true; + } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mDictBuffer->getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. 
+ return true; + } else if (mHeaderPolicy->isDecayingDict()) { + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), + mHeaderPolicy); + } + return false; +} + +/* Answers a textual query by printing a decimal value into outResult (at most maxResultLength bytes): current unigram or bigram count, or the maximum unigram or bigram count. A query that matches none of the four known strings leaves outResult untouched. */ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, + char *const outResult, const int maxResultLength) { + const int compareLength = queryLength + 1 /* terminator */; + if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mEntryCounters.getNgramCount(NgramType::Unigram)); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Unigram)) : + static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? 
+ ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Bigram)) : + static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } +} + +/* Collects the unigram property, the bigram targets and the shortcut targets of wordCodePoints into a WordProperty. Returns a default-constructed WordProperty when the word is not found (exact-case lookup). */ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int ptNodePos = getTerminalPtNodePosFromWordId( + getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); + if (ptNodePos == NOT_A_DICT_POS) { + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + const ProbabilityEntry probabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + ptNodeParams.getTerminalId()); + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + // Fetch bigram information. + std::vector ngrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + if (bigramListPos != NOT_A_DICT_POS) { + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); + const TerminalPositionLookupTable *const terminalPositionLookupTable = + mBuffers->getTerminalPositionLookupTable(); + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + const int word1TerminalId = bigramEntry.getTargetTerminalId(); + const int word1TerminalPtNodePos = + terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); + if (word1TerminalPtNodePos == NOT_A_DICT_POS) { + continue; + } + const int codePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH, + bigramWord1CodePoints); + /* This inner historicalInfo shadows the unigram-level one declared before the loop. */ const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); + const int 
rawBigramProbability = bigramEntry.hasHistoricalInfo() + ? ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mHeaderPolicy) + : bigramEntry.getProbability(); + const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(), + ptNodeParams.representsBeginningOfSentence(), rawBigramProbability); + ngrams.emplace_back( + NgramContext(wordCodePoints.data(), wordCodePoints.size(), + ptNodeParams.representsBeginningOfSentence()), + CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(), + probability, *historicalInfo); + } + } + // Fetch shortcut information. + std::vector shortcuts; + int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTarget[MAX_WORD_LENGTH]; + const ShortcutDictContent *const shortcutDictContent = + mBuffers->getShortcutDictContent(); + bool hasNext = true; + while (hasNext) { + int shortcutTargetLength = 0; + int shortcutProbability = NOT_A_PROBABILITY; + shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); + shortcuts.emplace_back( + CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), + shortcutProbability); + } + } + const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), + ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), + ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); +} + +/* Word iteration step. Token 0 rebuilds the list of terminal PtNode positions by traversing the trie from the root. The word at index token is written to outCodePoints/outCodePointCount and the next token is returned; 0 is returned for an invalid token or after the last word, at which point the cached list is cleared. */ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper 
readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + *outCodePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +/* Maps a terminal PtNode position to a word id: the value is passed through, except that NOT_A_DICT_POS becomes NOT_A_WORD_ID. */ int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { + return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; +} + +/* Maps a word id to a terminal PtNode position: the value is passed through, except that NOT_A_WORD_ID becomes NOT_A_DICT_POS. */ int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { + return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h new file mode 100644 index 000000000..bce5f6bea --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! + * Do not edit this file other than updating policy's interface. + * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_policy.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" +#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/binary_dictionary_shortcut_iterator.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "utils/int_array_view.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class DicNode; +namespace backward { +namespace v402 { +} 
// namespace v402 +} // namespace backward +class DicNodeVector; +namespace backward { +namespace v402 { + +// Word id = Position of a PtNode that represents the word. +// Max supported n-gram is bigram. +class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + /* Moves buffers into mBuffers and builds the readers, writers, list policies and entry counters on top of it; the policy starts out not corrupted. */ Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) + : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), + mDictBuffer(mBuffers->getWritableTrieBuffer()), + mBigramPolicy(mBuffers->getMutableBigramDictContent(), + mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy), + mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()), + mNodeReader(mDictBuffer, mBuffers->getProbabilityDictContent(), mHeaderPolicy), + mPtNodeArrayReader(mDictBuffer), + mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, + &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), + mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), + mWritingHelper(mBuffers.get()), + mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), + mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; + + /* The root PtNode array is at position 0. */ virtual int getRootPosition() const { + return 0; + } + + void createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; + + int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; + + const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; + + void iterateNgramEntries(const WordIdArrayView prevWordIds, + 
NgramListener *const listener) const; + + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return mHeaderPolicy; + } + + bool addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty); + + bool removeUnigramEntry(const CodePointArrayView wordCodePoints); + + bool addNgramEntry(const NgramProperty *const ngramProperty); + + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints); + + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo); + + bool flush(const char *const filePath); + + bool flushWithGC(const char *const filePath); + + bool needsToRunGC(const bool mindsBlockByGC) const; + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength); + + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); + + /* Reports the mIsCorrupted flag. */ bool isCorrupted() const { + return mIsCorrupted; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); + + static const char *const UNIGRAM_COUNT_QUERY; + static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; + // When the dictionary size is near the maximum size, we have to refuse dynamic operations to + // prevent the dictionary from overflowing. 
+ static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int DUMMY_PROBABILITY_FOR_VALID_WORDS; + + /* Declaration order matters below: the constructor initializes later members from earlier ones (mBuffers, mHeaderPolicy, mDictBuffer, then the readers and policies the writer and helpers point to), and C++ initializes members in declaration order. */ const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; + const HeaderPolicy *const mHeaderPolicy; + BufferWithExtendableBuffer *const mDictBuffer; + Ver4BigramListPolicy mBigramPolicy; + Ver4ShortcutListPolicy mShortcutPolicy; + Ver4PatriciaTrieNodeReader mNodeReader; + Ver4PtNodeArrayReader mPtNodeArrayReader; + Ver4PatriciaTrieNodeWriter mNodeWriter; + DynamicPtUpdatingHelper mUpdatingHelper; + Ver4PatriciaTrieWritingHelper mWritingHelper; + MutableEntryCounters mEntryCounters; + /* Terminal PtNode positions collected for word iteration. */ std::vector mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; + + int getBigramsPositionOfPtNode(const int ptNodePos) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; + int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; + int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const; + int getBigramConditionalProbability(const int prevWordUnigramProbability, + const bool isInBeginningOfSentenceContext, const int bigramProbability) const; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif // LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..b8a4cf847 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +/* Returns the terminal id read from buffer at *pos by delegating to ByteArrayUtils::readUint32AndAdvancePosition(), which also receives pos so it can move it forward. */ /* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition( + const uint8_t *const buffer, int *pos) { + return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h new file mode 100644 index 000000000..c3e736bdc --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_reading_utils.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H + +#include + +#include "defines.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { + +/* Static-only reading helper for the v402 format; DISALLOW_IMPLICIT_CONSTRUCTORS keeps it from being instantiated implicitly. */ class Ver4PatriciaTrieReadingUtils { + public: + /* Reads a terminal id from buffer at *pos; pos is passed on so the position can be advanced. */ static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils); +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..c0af9eae6 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" + +#include +#include + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { +namespace backward { +namespace v402 { + +/* Writes the dictionary without GC: fills a header buffer with the given entry counts and the current extended region size (header value plus the trie buffer's used additional size, last-decayed time not updated), then flushes header and dictionary buffers to dictDirPath. Returns false when the header cannot be written or flushing fails. */ bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, + const EntryCounts &entryCounts) const { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, + entryCounts, extendedRegionSize, &headerBuffer)) { + AKLOGE("Cannot write header structure to buffer. 
" + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " + "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), + entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize); + return false; + } + return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +/* Writes the dictionary with GC: runs runGC() from rootPtNodeArrayPos into newly created buffers, then writes a header carrying the unigram and bigram counts reported by runGC(), an extended region size of 0 and an updated last-decayed time, and flushes the new buffers to dictDirPath. Returns false when GC, the header write or flushing fails. */ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const dictDirPath) { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( + Ver4DictBuffers::createVer4DictBuffers(headerPolicy, + Ver4DictConstants::MAX_DICTIONARY_SIZE)); + int unigramCount = 0; + int bigramCount = 0; + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { + return false; + } + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + MutableEntryCounters entryCounters; + entryCounters.setNgramCount(NgramType::Unigram, unigramCount); + entryCounters.setNgramCount(NgramType::Bigram, bigramCount); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { + return false; + } + return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, + int *const outUnigramCount, int *const outBigramCount) { + Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), + mBuffers->getProbabilityDictContent(), headerPolicy); + Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); + Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), + mBuffers->getTerminalPositionLookupTable(), headerPolicy); + Ver4ShortcutListPolicy 
shortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), + mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, + &shortcutPolicy); + + DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + &ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount(); + const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram); + if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { + if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { + AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, + maxUnigramCount); + return false; + } + } + + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability + traversePolicyToUpdateBigramProbability(&ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateBigramProbability)) { + return false; + } + const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); + const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram); + if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { + if (!truncateBigrams(maxBigramCount)) { + AKLOGE("Cannot remove bigrams. 
current: %d, max: %d", bigramCount, maxBigramCount); + return false; + } + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, + &shortcutPolicy); + DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, + buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instances for the GCed dictionary. + Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), + buffersToWrite->getProbabilityDictContent(), headerPolicy); + Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); + Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), + buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); + Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), + buffersToWrite->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, + &newShortcutPolicy); + // Re-assign terminal IDs for valid terminal PtNodes. + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( + &terminalIdMap)) { + return false; + } + // Run GC for probability dict content. 
+ if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap, + mBuffers->getProbabilityDictContent())) { + return false; + } + // Run GC for bigram dict content. + if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, + mBuffers->getBigramDictContent(), outBigramCount)) { + return false; + } + // Run GC for shortcut dict content. + if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, + mBuffers->getShortcutDictContent())) { + return false; + } + DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); + if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { + return false; + } + *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); + return true; +} + +bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( + const Ver4PatriciaTrieNodeReader *const ptNodeReader, + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { + const TerminalPositionLookupTable *const terminalPosLookupTable = + mBuffers->getTerminalPositionLookupTable(); + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); + std::priority_queue, DictProbabilityComparator> + priorityQueue; + for (int i = 0; i < nextTerminalId; ++i) { + const int terminalPos = 
terminalPosLookupTable->getTerminalPtNodePosition(i); + if (terminalPos == NOT_A_DICT_POS) { + continue; + } + const ProbabilityEntry probabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry(i); + const int probability = probabilityEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : + probabilityEntry.getProbability(); + priorityQueue.push(DictProbability(terminalPos, probability, + probabilityEntry.getHistoricalInfo()->getTimestamp())); + } + + // Delete unigrams. + while (static_cast(priorityQueue.size()) > maxUnigramCount) { + const int ptNodePos = priorityQueue.top().getDictPos(); + priorityQueue.pop(); + const PtNodeParams ptNodeParams = + ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.representsNonWordInfo()) { + continue; + } + if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonterminal. 
PtNode pos: %d", ptNodePos); + return false; + } + } + return true; +} + +bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { + const TerminalPositionLookupTable *const terminalPosLookupTable = + mBuffers->getTerminalPositionLookupTable(); + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); + std::priority_queue, DictProbabilityComparator> + priorityQueue; + BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); + for (int i = 0; i < nextTerminalId; ++i) { + const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); + if (bigramListPos == NOT_A_DICT_POS) { + continue; + } + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const int entryPos = readingPos; + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + if (!bigramEntry.isValid()) { + continue; + } + const int probability = bigramEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : + bigramEntry.getProbability(); + priorityQueue.push(DictProbability(entryPos, probability, + bigramEntry.getHistoricalInfo()->getTimestamp())); + } + } + + // Delete bigrams. + while (static_cast(priorityQueue.size()) > maxBigramCount) { + const int entryPos = priorityQueue.top().getDictPos(); + const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); + const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); + if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { + AKLOGE("Cannot write bigram entry to remove. 
pos: %d", entryPos); + return false; + } + priorityQueue.pop(); + } + return true; +} + +bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isTerminal()) { + return true; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + mTerminalIdMap->find(ptNodeParams->getTerminalId()); + if (it == mTerminalIdMap->end()) { + AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", + ptNodeParams->getTerminalId(), mTerminalIdMap->size()); + return false; + } + if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { + AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); + } + return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h new file mode 100644 index 000000000..f2b873826 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/ver4_patricia_trie_writing_helper.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/utils/entry_counters.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class HeaderPolicy; +namespace backward { +namespace v402 { +class Ver4DictBuffers; +class Ver4PatriciaTrieNodeReader; +class Ver4PatriciaTrieNodeWriter; + +class Ver4PatriciaTrieWritingHelper { + public: + Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) + : mBuffers(buffers) {} + + bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; + + // This method cannot be const because the original dictionary buffer will be updated to detect + // useless PtNodes during GC. 
+ bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); + + class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds( + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap) + : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds); + + Ver4PatriciaTrieNodeWriter *const mPtNodeWriter; + const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; + }; + + // For truncateUnigrams() and truncateBigrams(). + class DictProbability { + public: + DictProbability(const int dictPos, const int probability, const int timestamp) + : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {} + + int getDictPos() const { + return mDictPos; + } + + int getProbability() const { + return mProbability; + } + + int getTimestamp() const { + return mTimestamp; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability); + + int mDictPos; + int mProbability; + int mTimestamp; + }; + + // For truncateUnigrams() and truncateBigrams(). 
+ class DictProbabilityComparator { + public: + bool operator()(const DictProbability &left, const DictProbability &right) { + if (left.getProbability() != right.getProbability()) { + return left.getProbability() > right.getProbability(); + } + if (left.getTimestamp() != right.getTimestamp()) { + return left.getTimestamp() < right.getTimestamp(); + } + return left.getDictPos() > right.getDictPos(); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator); + }; + + bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, + Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, + int *const outBigramCount); + + bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader, + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount); + + bool truncateBigrams(const int maxBigramCount); + + Ver4DictBuffers *const mBuffers; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime + +#endif /* LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp new file mode 100644 index 000000000..d27d70816 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! + * + * This file was generated from + * dictionary/structure/v4/ver4_pt_node_array_reader.cpp + */ + +#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" + +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace backward { +namespace v402 { + +bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { + // Reading invalid position because of a bug or a broken dictionary. + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", + ptNodeArrayPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = ptNodeArrayPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + dictBuf, &readingPos); + if (usesAdditionalBuffer) { + readingPos += mBuffer->getOriginalBufferSize(); + } + if (ptNodeCountInArray < 0) { + AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); + return false; + } + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) { + // 
Reading invalid position because of bug or broken dictionary. + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", + forwordLinkPos, mBuffer->getTailPosition()); + ASSERT(false); + return false; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int readingPos = forwordLinkPos; + if (usesAdditionalBuffer) { + readingPos -= mBuffer->getOriginalBufferSize(); + } + const int nextPtNodeArrayOffset = + DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos); + if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) { + *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset; + } else { + *outNextPtNodeArrayPos = NOT_A_DICT_POS; + } + return true; +} + +} // namespace v402 +} // namespace backward +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h new file mode 100644 index 000000000..0039bf8fc --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * !!!!! DO NOT EDIT THIS FILE !!!!! 
+ * + * This file was generated from + * dictionary/structure/v4/ver4_pt_node_array_reader.h + */ + +#ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H +#define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" + +namespace latinime { +namespace backward { +namespace v402 { + +} // namespace v402 +} // namespace backward +class BufferWithExtendableBuffer; +namespace backward { +namespace v402 { + +class Ver4PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); + + const BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace v402 +} // namespace backward +} // namespace latinime +#endif /* LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/app/src/main/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp new file mode 100644 index 000000000..4470e8568 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" + +#include + +#include "defines.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/v2/patricia_trie_policy.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/ver4_patricia_trie_policy.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForExistingDictFile( + const char *const path, const int bufOffset, const int size, + const bool isUpdatable) { + if (FileUtils::existsDir(path)) { + // Given path represents a directory. + return newPolicyForDirectoryDict(path, isUpdatable); + } else { + if (isUpdatable) { + AKLOGE("One file dictionaries don't support updating. 
path: %s", path); + ASSERT(false); + return nullptr; + } + return newPolicyForFileDict(path, bufOffset, size); + } +} + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict( + const int formatVersion, const std::vector &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { + FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); + switch (dictFormatVersion) { + case FormatUtils::VERSION_402: { + return newPolicyForOnMemoryV4Dict( + dictFormatVersion, locale, attributeMap); + } + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_403: { + return newPolicyForOnMemoryV4Dict( + dictFormatVersion, locale, attributeMap); + } + default: + AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary", + formatVersion); + break; + } + return nullptr; +} + +template +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryV4Dict( + const FormatUtils::FORMAT_VERSION formatVersion, + const std::vector &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { + HeaderPolicy headerPolicy(formatVersion, locale, attributeMap); + DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, + DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); + if (!DynamicPtWritingUtils::writeEmptyDictionary( + dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { + AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); + return nullptr; + } + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new StructurePolicy(std::move(dictBuffers))); +} + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForDirectoryDict( + const char *const path, const bool isUpdatable) { + const int 
headerFilePathBufSize = PATH_MAX + 1 /* terminator */; + char headerFilePath[headerFilePathBufSize]; + getHeaderFilePathInDictDir(path, headerFilePathBufSize, headerFilePath); + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // MmappedBufferPtr if the instance has the responsibility. + MmappedBuffer::MmappedBufferPtr mmappedBuffer = + MmappedBuffer::openBuffer(headerFilePath, isUpdatable); + if (!mmappedBuffer) { + return nullptr; + } + const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion( + mmappedBuffer->getReadOnlyByteArrayView()); + switch (formatVersion) { + case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + case FormatUtils::VERSION_202: + AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path); + break; + case FormatUtils::VERSION_402: { + return newPolicyForV4Dict( + headerFilePath, formatVersion, std::move(mmappedBuffer)); + } + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_403: { + return newPolicyForV4Dict( + headerFilePath, formatVersion, std::move(mmappedBuffer)); + } + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path); + break; + } + ASSERT(false); + return nullptr; +} + +template +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForV4Dict( + const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion, + MmappedBuffer::MmappedBufferPtr &&mmappedBuffer) { + const int dictDirPathBufSize = strlen(headerFilePath) + 1 /* terminator */; + char dictPath[dictDirPathBufSize]; + if (!FileUtils::getFilePathWithoutSuffix(headerFilePath, + DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) { + AKLOGE("Dictionary file name is not valid as a ver4 dictionary. 
header path: %s", + headerFilePath); + ASSERT(false); + return nullptr; + } + DictBuffersPtr dictBuffers = + DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion); + if (!dictBuffers || !dictBuffers->isValid()) { + AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s", + dictPath); + ASSERT(false); + return nullptr; + } + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new StructurePolicy(std::move(dictBuffers))); +} + +/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr + DictionaryStructureWithBufferPolicyFactory::newPolicyForFileDict( + const char *const path, const int bufOffset, const int size) { + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // MmappedBufferPtr if the instance has the responsibility. + MmappedBuffer::MmappedBufferPtr mmappedBuffer( + MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */)); + if (!mmappedBuffer) { + return nullptr; + } + switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) { + case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + break; + case FormatUtils::VERSION_202: + return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( + new PatriciaTriePolicy(std::move(mmappedBuffer))); + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_402: + case FormatUtils::VERSION_403: + AKLOGE("Given path is a file but the format is version 4. path: %s", path); + break; + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number. 
path: %s", path); + break; + } + ASSERT(false); + return nullptr; +} + +/* static */ void DictionaryStructureWithBufferPolicyFactory::getHeaderFilePathInDictDir( + const char *const dictDirPath, const int outHeaderFileBufSize, + char *const outHeaderFilePath) { + const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; + char dictName[dictNameBufSize]; + FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); + snprintf(outHeaderFilePath, outHeaderFileBufSize, "%s/%s%s", dictDirPath, + dictName, Ver4DictConstants::HEADER_FILE_EXTENSION); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/app/src/main/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h new file mode 100644 index 000000000..b0c04c0b1 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H +#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicyFactory { + public: + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyForExistingDictFile(const char *const path, const int bufOffset, + const int size, const bool isUpdatable); + + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyForOnMemoryDict(const int formatVersion, const std::vector &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); + + template + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyForOnMemoryV4Dict(const FormatUtils::FORMAT_VERSION formatVersion, + const std::vector &locale, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyForDirectoryDict(const char *const path, const bool isUpdatable); + + template + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr newPolicyForV4Dict( + const char *const headerFilePath, const FormatUtils::FORMAT_VERSION formatVersion, + MmappedBuffer::MmappedBufferPtr &&mmappedBuffer); + + static DictionaryStructureWithBufferPolicy::StructurePolicyPtr + newPolicyForFileDict(const char *const path, const int bufOffset, const int size); + + static void getHeaderFilePathInDictDir(const char *const dirPath, + const int outHeaderFileBufSize, char *const outHeaderFilePath); +}; +} // namespace latinime +#endif // 
LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp new file mode 100644 index 000000000..64f9b6663 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" + +#include "dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = + 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; +// Flag for presence of more attributes +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT = + 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; + +/* static */ bool BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + const ReadOnlyByteArrayView buffer, BigramFlags *const outBigramFlags, + int *const outTargetPtNodePos, int *const bigramEntryPos) { + if (static_cast(buffer.size()) <= *bigramEntryPos) { + AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). 
bufSize: %zd, " + "bigramEntryPos: %d.", buffer.size(), *bigramEntryPos); + return false; + } + const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), + bigramEntryPos); + if (outBigramFlags) { + *outBigramFlags = bigramFlags; + } + const int targetPos = getBigramAddressAndAdvancePosition(buffer, bigramFlags, bigramEntryPos); + if (outTargetPtNodePos) { + *outTargetPtNodePos = targetPos; + } + return true; +} + +/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const ReadOnlyByteArrayView buffer, + int *const bigramListPos) { + BigramFlags flags; + do { + if (!getBigramEntryPropertiesAndAdvancePosition(buffer, &flags, 0 /* outTargetPtNodePos */, + bigramListPos)) { + return false; + } + } while(hasNext(flags)); + return true; +} + +/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition( + const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos) { + int offset = 0; + const int origin = *pos; + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer.data(), pos); + break; + } + if (isOffsetNegative(flags)) { + return origin - offset; + } else { + return origin + offset; + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h b/app/src/main/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h new file mode 100644 index 000000000..a0f7d5e83 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013 The Android Open Source 
Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H +#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H + +#include +#include + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class BigramListReadWriteUtils { +public: + typedef uint8_t BigramFlags; + + static bool getBigramEntryPropertiesAndAdvancePosition(const ReadOnlyByteArrayView buffer, + BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, + int *const bigramEntryPos); + + static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // Bigrams reading methods + static bool skipExistingBigrams(const ReadOnlyByteArrayView buffer, int *const bigramListPos); + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); + + static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; + + static AK_FORCE_INLINE bool 
isOffsetNegative(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; + } + + static int getBigramAddressAndAdvancePosition(const ReadOnlyByteArrayView buffer, + const BigramFlags flags, int *const pos); +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp new file mode 100644 index 000000000..b5e2e9dae --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" + +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" + +namespace latinime { + +bool DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless + // children. 
+ bool isUselessPtNode = !ptNodeParams->isTerminal(); + if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { + bool needsToKeepPtNode = true; + if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + ptNodeParams, &needsToKeepPtNode)) { + AKLOGE("Cannot update PtNode probability or get needs to keep PtNode after GC."); + return false; + } + if (!needsToKeepPtNode) { + isUselessPtNode = true; + } + } + if (mChildrenValue > 0) { + isUselessPtNode = false; + } else if (ptNodeParams->isTerminal()) { + // Remove children as all children are useless. + if (!mPtNodeWriter->updateChildrenPosition(ptNodeParams, + NOT_A_DICT_POS /* newChildrenPosition */)) { + return false; + } + } + if (isUselessPtNode) { + // Current PtNode is no longer needed. Mark it as deleted. + if (!mPtNodeWriter->markPtNodeAsDeleted(ptNodeParams)) { + return false; + } + } else { + mValueStack.back() += 1; + if (ptNodeParams->isTerminal() && !ptNodeParams->representsNonWordInfo()) { + mValidUnigramCount += 1; + } + } + return true; +} + +bool DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isDeleted()) { + int bigramEntryCount = 0; + if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams, + &bigramEntryCount)) { + return false; + } + mValidBigramEntryCount += bigramEntryCount; + } + return true; +} + +// Writes dummy PtNode array size when the head of PtNode array is read. +bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onDescend(const int ptNodeArrayPos) { + mValidPtNodeCount = 0; + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( + PtNodeWriter::PtNodeArrayPositionRelocationMap::value_type(ptNodeArrayPos, writingPos)); + // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. 
+ // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. + mPtNodeArraySizeFieldPos = writingPos; + return DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, 0 /* arraySize */, &writingPos); +} + +// Write PtNode array terminal and actual PtNode array size. +bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onReadingPtNodeArrayTail() { + int writingPos = mBufferToWrite->getTailPosition(); + // Write PtNode array terminal. + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( + mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Write actual PtNode array size. + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { + return false; + } + return true; +} + +// Write valid PtNode to buffer and memorize mapping from the old position to the new position. +bool DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (ptNodeParams->isDeleted()) { + // Current PtNode is not written in new buffer because it has been deleted. + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + PtNodeWriter::PtNodePositionRelocationMap::value_type( + ptNodeParams->getHeadPos(), NOT_A_DICT_POS)); + return true; + } + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + PtNodeWriter::PtNodePositionRelocationMap::value_type( + ptNodeParams->getHeadPos(), writingPos)); + mValidPtNodeCount++; + // Writes current PtNode. + return mPtNodeWriter->writePtNodeAndAdvancePosition(ptNodeParams, &writingPos); +} + +bool DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + // Updates parent position. 
+ int bigramCount = 0; + if (!mPtNodeWriter->updateAllPositionFields(ptNodeParams, mDictPositionRelocationMap, + &bigramCount)) { + return false; + } + mBigramCount += bigramCount; + if (ptNodeParams->isTerminal()) { + mUnigramCount++; + } + return true; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h new file mode 100644 index 000000000..8c7ad965b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H +#define LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H + +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +class PtNodeParams; + +class DynamicPtGcEventListeners { + public: + // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or + // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC. + // TODO: Concatenate non-terminal PtNodes. 
+ class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + PtNodeWriter *const ptNodeWriter) + : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0), + mValidUnigramCount(0) {} + + ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; + + bool onAscend() { + if (mValueStack.empty()) { + return false; + } + mChildrenValue = mValueStack.back(); + mValueStack.pop_back(); + return true; + } + + bool onDescend(const int ptNodeArrayPos) { + mValueStack.push_back(0); + mChildrenValue = 0; + return true; + } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + int getValidUnigramCount() const { + return mValidUnigramCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS( + TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); + + PtNodeWriter *const mPtNodeWriter; + std::vector mValueStack; + int mChildrenValue; + int mValidUnigramCount; + }; + + // TODO: Remove when we stop supporting v402 format. + // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram + // entries. 
+ class TraversePolicyToUpdateBigramProbability + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateBigramProbability(PtNodeWriter *const ptNodeWriter) + : mPtNodeWriter(ptNodeWriter), mValidBigramEntryCount(0) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + int getValidBigramEntryCount() const { + return mValidBigramEntryCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); + + PtNodeWriter *const mPtNodeWriter; + int mValidBigramEntryCount; + }; + + class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToPlaceAndWriteValidPtNodesToBuffer( + PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const bufferToWrite, + PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap) + : mPtNodeWriter(ptNodeWriter), mBufferToWrite(bufferToWrite), + mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0), + mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {}; + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos); + + bool onReadingPtNodeArrayTail(); + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer); + + PtNodeWriter *const mPtNodeWriter; + BufferWithExtendableBuffer *const mBufferToWrite; + PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap; + int mValidPtNodeCount; + int mPtNodeArraySizeFieldPos; + }; + + class TraversePolicyToUpdateAllPositionFields + : public DynamicPtReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPositionFields(PtNodeWriter *const ptNodeWriter, + const 
PtNodeWriter::DictPositionRelocationMap *const dictPositionRelocationMap) + : mPtNodeWriter(ptNodeWriter), + mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0), + mBigramCount(0) {}; + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + int getUnigramCount() const { + return mUnigramCount; + } + + int getBigramCount() const { + return mBigramCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); + + PtNodeWriter *const mPtNodeWriter; + const PtNodeWriter::DictPositionRelocationMap *const mDictPositionRelocationMap; + int mUnigramCount; + int mBigramCount; + }; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtGcEventListeners); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_GC_EVENT_LISTENERS_H */ diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp new file mode 100644 index 000000000..294bc6ea9 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp @@ -0,0 +1,321 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" + +#include "dictionary/structure/pt_common/pt_node_array_reader.h" +#include "utils/char_utils.h" + +namespace latinime { + +// To avoid infinite loop caused by invalid or malicious forward links. +const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; + +bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode( + const PtNodeParams *const ptNodeParams) { + if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) { + mTerminalPositions->push_back(ptNodeParams->getHeadPos()); + } + return true; +} + +// Visits all PtNodes in post-order depth first manner. +// For example, visits c -> b -> y -> x -> a for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + if (!alreadyVisitedChildren) { + if (ptNodeParams.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(ptNodeParams.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(ptNodeParams); + } else { + alreadyVisitedChildren = true; + } + } else { + if (!listener->onVisitingPtNode(&ptNodeParams)) { + return false; + } + readNextSiblingNode(ptNodeParams); + if (isEnd()) { + // All PtNodes in current linked PtNode arrays have been visited. + // Return to the parent. 
+ if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + if (!listener->onAscend()) { + return false; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + } else { + // Process sibling PtNode. + alreadyVisitedChildren = false; + } + } + } + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order +// that PtNodes are written in the dictionary buffer. +// For example, visits a -> b -> x -> c -> y for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedAllPtNodesInArray = false; + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + if (isEnd()) { + // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + } + pushReadingStateToStack(); + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + if (alreadyVisitedAllPtNodesInArray) { + if (alreadyVisitedChildren) { + // Move to next sibling PtNode's children. + readNextSiblingNode(ptNodeParams); + if (isEnd()) { + // Return to the parent PTNode. + if (!listener->onAscend()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + alreadyVisitedAllPtNodesInArray = true; + } else { + alreadyVisitedChildren = false; + } + } else { + if (ptNodeParams.hasChildren()) { + // Move to the first child. 
+ if (!listener->onDescend(ptNodeParams.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(ptNodeParams); + // Push state to return the head of PtNode array. + pushReadingStateToStack(); + alreadyVisitedAllPtNodesInArray = false; + alreadyVisitedChildren = false; + } else { + alreadyVisitedChildren = true; + } + } + } else { + if (!listener->onVisitingPtNode(&ptNodeParams)) { + return false; + } + readNextSiblingNode(ptNodeParams); + if (isEnd()) { + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + // Return to the head of current PtNode array. + popReadingStateFromStack(); + alreadyVisitedAllPtNodesInArray = true; + } + } + } + popReadingStateFromStack(); + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount, + int *const outCodePoints) { + // This method traverses parent nodes from the terminal by following parent pointers; thus, + // node code points are stored in the buffer in the reverse order. + int reverseCodePoints[maxCodePointCount]; + const PtNodeParams terminalPtNodeParams(getPtNodeParams()); + // First, read the terminal node and get its probability. + if (!isValidTerminalNode(terminalPtNodeParams)) { + // Node at the ptNodePos is not a valid terminal node. + return 0; + } + // Then, following parent node link to the dictionary root and fetch node code points. + int totalCodePointCount = 0; + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + totalCodePointCount = getTotalCodePointCount(ptNodeParams); + if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) { + // The ptNodePos is not a valid terminal node position in the dictionary. + return 0; + } + // Store node code points to buffer in the reverse order. 
+ fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(), + reverseCodePoints); + // Follow parent node toward the root node. + readParentNode(ptNodeParams); + } + if (isError()) { + // The node position or the dictionary is invalid. + return 0; + } + // Reverse the stored code points to output them. + for (int i = 0; i < totalCodePointCount; ++i) { + outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1]; + } + return totalCodePointCount; +} + +int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord, + const size_t length, const bool forceLowerCaseSearch) { + int searchCodePoints[length]; + for (size_t i = 0; i < length; ++i) { + searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; + } + while (!isEnd()) { + const PtNodeParams ptNodeParams(getPtNodeParams()); + const int matchedCodePointCount = getPrevTotalCodePointCount(); + if (getTotalCodePointCount(ptNodeParams) > length + || !isMatchedCodePoint(ptNodeParams, 0 /* index */, + searchCodePoints[matchedCodePointCount])) { + // Current node has too many code points or its first code point is different from + // target code point. Skip this node and read the next sibling node. + readNextSiblingNode(ptNodeParams); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = ptNodeParams.getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) { + // Different code point is found. The given word is not included in the dictionary. + return NOT_A_DICT_POS; + } + } + // All characters are matched. + if (length == getTotalCodePointCount(ptNodeParams)) { + if (!ptNodeParams.isTerminal()) { + return NOT_A_DICT_POS; + } + // Terminal position is found. 
+ return ptNodeParams.getHeadPos(); + } + if (!ptNodeParams.hasChildren()) { + return NOT_A_DICT_POS; + } + // Advance to the children nodes. + readChildNode(ptNodeParams); + } + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + return NOT_A_DICT_POS; +} + +// Read node array size and process empty node arrays. Nodes and arrays are counted up in this +// method to avoid an infinite loop. +void DynamicPtReadingHelper::nextPtNodeArray() { + int ptNodeCountInArray = 0; + int firstPtNodePos = NOT_A_DICT_POS; + if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid( + mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) { + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos; + mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray; + mReadingState.mPos = firstPtNodePos; + // Count up nodes and node arrays to avoid infinite loop. + mReadingState.mTotalPtNodeIndexInThisArrayChain += + mReadingState.mRemainingPtNodeCountInThisArray; + mReadingState.mPtNodeArrayIndexInThisArrayChain++; + if (mReadingState.mRemainingPtNodeCountInThisArray < 0 + || mReadingState.mTotalPtNodeIndexInThisArrayChain + > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP + || mReadingState.mPtNodeArrayIndexInThisArrayChain + > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { + // Invalid dictionary. + AKLOGI("Invalid dictionary. 
nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" + "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", + mReadingState.mRemainingPtNodeCountInThisArray, + mReadingState.mTotalPtNodeIndexInThisArrayChain, + MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, + mReadingState.mPtNodeArrayIndexInThisArrayChain, + MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + if (mReadingState.mRemainingPtNodeCountInThisArray == 0) { + // Empty node array. Try following forward link. + followForwardLink(); + } +} + +// Follow the forward link and read the next node array if exists. +void DynamicPtReadingHelper::followForwardLink() { + int nextPtNodeArrayPos = NOT_A_DICT_POS; + if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid( + mReadingState.mPos, &nextPtNodeArrayPos)) { + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; + if (nextPtNodeArrayPos != NOT_A_DICT_POS) { + // Follow the forward link. + mReadingState.mPos = nextPtNodeArrayPos; + nextPtNodeArray(); + } else { + // All node arrays have been read. + mReadingState.mPos = NOT_A_DICT_POS; + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h new file mode 100644 index 000000000..d8ddc7c2b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h @@ -0,0 +1,282 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_READING_HELPER_H +#define LATINIME_DYNAMIC_PT_READING_HELPER_H + +#include +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { + +class DictionaryShortcutsStructurePolicy; +class PtNodeArrayReader; + +/* + * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and + * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. + */ +class DynamicPtReadingHelper { + public: + class TraversingEventListener { + public: + virtual ~TraversingEventListener() {}; + + // Returns whether the event handling was succeeded or not. + virtual bool onAscend() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onDescend(const int ptNodeArrayPos) = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onReadingPtNodeArrayTail() = 0; + + // Returns whether the event handling was succeeded or not. 
+ virtual bool onVisitingPtNode(const PtNodeParams *const node) = 0; + + protected: + TraversingEventListener() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); + }; + + class TraversePolicyToGetAllTerminalPtNodePositions : public TraversingEventListener { + public: + TraversePolicyToGetAllTerminalPtNodePositions(std::vector *const terminalPositions) + : mTerminalPositions(terminalPositions) {} + bool onAscend() { return true; } + bool onDescend(const int ptNodeArrayPos) { return true; } + bool onReadingPtNodeArrayTail() { return true; } + bool onVisitingPtNode(const PtNodeParams *const ptNodeParams); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToGetAllTerminalPtNodePositions); + + std::vector *const mTerminalPositions; + }; + + DynamicPtReadingHelper(const PtNodeReader *const ptNodeReader, + const PtNodeArrayReader *const ptNodeArrayReader) + : mIsError(false), mReadingState(), mPtNodeReader(ptNodeReader), + mPtNodeArrayReader(ptNodeArrayReader), mReadingStateStack() {} + + ~DynamicPtReadingHelper() {} + + AK_FORCE_INLINE bool isError() const { + return mIsError; + } + + AK_FORCE_INLINE bool isEnd() const { + return mReadingState.mPos == NOT_A_DICT_POS; + } + + // Initialize reading state with the head position of a PtNode array. + AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) { + if (ptNodeArrayPos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mReadingState.mPos = ptNodeArrayPos; + mReadingState.mTotalCodePointCountSinceInitialization = 0; + mReadingState.mTotalPtNodeIndexInThisArrayChain = 0; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 0; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingStateStack.clear(); + nextPtNodeArray(); + } + } + + // Initialize reading state with the head position of a node. 
+ AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) { + if (ptNodePos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mReadingState.mPos = ptNodePos; + mReadingState.mRemainingPtNodeCountInThisArray = 1; + mReadingState.mTotalCodePointCountSinceInitialization = 0; + mReadingState.mTotalPtNodeIndexInThisArrayChain = 1; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 1; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS; + mReadingStateStack.clear(); + } + } + + AK_FORCE_INLINE const PtNodeParams getPtNodeParams() const { + if (isEnd()) { + return PtNodeParams(); + } + return mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(mReadingState.mPos); + } + + AK_FORCE_INLINE bool isValidTerminalNode(const PtNodeParams &ptNodeParams) const { + return !isEnd() && !ptNodeParams.isDeleted() && ptNodeParams.isTerminal(); + } + + AK_FORCE_INLINE bool isMatchedCodePoint(const PtNodeParams &ptNodeParams, const int index, + const int codePoint) const { + return ptNodeParams.getCodePoints()[index] == codePoint; + } + + // Return code point count exclude the last read node's code points. + AK_FORCE_INLINE size_t getPrevTotalCodePointCount() const { + return mReadingState.mTotalCodePointCountSinceInitialization; + } + + // Return code point count include the last read node's code points. 
+ AK_FORCE_INLINE size_t getTotalCodePointCount(const PtNodeParams &ptNodeParams) const { + return mReadingState.mTotalCodePointCountSinceInitialization + + ptNodeParams.getCodePointCount(); + } + + AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(const PtNodeParams &ptNodeParams, + const int index, int *const outCodePoints) const { + const int nodeCodePointCount = ptNodeParams.getCodePointCount(); + const int *const nodeCodePoints = ptNodeParams.getCodePoints(); + for (int i = 0; i < nodeCodePointCount; ++i) { + outCodePoints[index + i] = nodeCodePoints[nodeCodePointCount - 1 - i]; + } + } + + AK_FORCE_INLINE void readNextSiblingNode(const PtNodeParams &ptNodeParams) { + mReadingState.mRemainingPtNodeCountInThisArray -= 1; + mReadingState.mPos = ptNodeParams.getSiblingNodePos(); + if (mReadingState.mRemainingPtNodeCountInThisArray <= 0) { + // All nodes in the current node array have been read. + followForwardLink(); + } + } + + // Read the first child node of the current node. + AK_FORCE_INLINE void readChildNode(const PtNodeParams &ptNodeParams) { + if (ptNodeParams.hasChildren()) { + mReadingState.mTotalCodePointCountSinceInitialization += + ptNodeParams.getCodePointCount(); + mReadingState.mTotalPtNodeIndexInThisArrayChain = 0; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 0; + mReadingState.mPos = ptNodeParams.getChildrenPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + // Read children node array. + nextPtNodeArray(); + } else { + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + // Read the parent node of the current node. 
+ AK_FORCE_INLINE void readParentNode(const PtNodeParams &ptNodeParams) { + if (ptNodeParams.getParentPos() != NOT_A_DICT_POS) { + mReadingState.mTotalCodePointCountSinceInitialization += + ptNodeParams.getCodePointCount(); + mReadingState.mTotalPtNodeIndexInThisArrayChain = 1; + mReadingState.mPtNodeArrayIndexInThisArrayChain = 1; + mReadingState.mRemainingPtNodeCountInThisArray = 1; + mReadingState.mPos = ptNodeParams.getParentPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfThisPtNodeArrayHead = NOT_A_DICT_POS; + } else { + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + AK_FORCE_INLINE int getPosOfLastForwardLinkField() const { + return mReadingState.mPosOfLastForwardLinkField; + } + + AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const { + return mReadingState.mPosOfThisPtNodeArrayHead; + } + + bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener); + + bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener); + + int getCodePointsAndReturnCodePointCount(const int maxCodePointCount, int *const outCodePoints); + + int getTerminalPtNodePositionOfWord(const int *const inWord, const size_t length, + const bool forceLowerCaseSearch); + + private: + DISALLOW_COPY_AND_ASSIGN(DynamicPtReadingHelper); + + // This class encapsulates the reading state of a position in the dictionary. It points at a + // specific PtNode in the dictionary. + class PtNodeReadingState { + public: + // Note that copy constructor and assignment operator are used for this class to use + // std::vector. + PtNodeReadingState() : mPos(NOT_A_DICT_POS), mRemainingPtNodeCountInThisArray(0), + mTotalCodePointCountSinceInitialization(0), mTotalPtNodeIndexInThisArrayChain(0), + mPtNodeArrayIndexInThisArrayChain(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), + mPosOfThisPtNodeArrayHead(NOT_A_DICT_POS) {} + + int mPos; + // Remaining node count in the current array. 
+ int mRemainingPtNodeCountInThisArray; + size_t mTotalCodePointCountSinceInitialization; + // Counter of PtNodes used to avoid infinite loops caused by broken or malicious links. + int mTotalPtNodeIndexInThisArrayChain; + // Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty + // PtNode arrays. + int mPtNodeArrayIndexInThisArrayChain; + int mPosOfLastForwardLinkField; + int mPosOfThisPtNodeArrayHead; + }; + + static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; + static const int MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; + static const size_t MAX_READING_STATE_STACK_SIZE; + + // TODO: Introduce error code to track what caused the error. + bool mIsError; + PtNodeReadingState mReadingState; + const PtNodeReader *const mPtNodeReader; + const PtNodeArrayReader *const mPtNodeArrayReader; + std::vector mReadingStateStack; + + void nextPtNodeArray(); + + void followForwardLink(); + + AK_FORCE_INLINE void pushReadingStateToStack() { + if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) { + AKLOGI("Reading state stack overflow. 
Max size: %zu", MAX_READING_STATE_STACK_SIZE); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingStateStack.push_back(mReadingState); + } + } + + AK_FORCE_INLINE void popReadingStateFromStack() { + if (mReadingStateStack.empty()) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingState = mReadingStateStack.back(); + mReadingStateStack.pop_back(); + } + } +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_READING_HELPER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp new file mode 100644 index 000000000..3eb55ed9b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" + +#include "defines.h" +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::MASK_MOVED = 0xC0; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_NOT_MOVED = 0xC0; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_MOVED = 0x40; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_IS_DELETED = 0x80; +const DynamicPtReadingUtils::NodeFlags DynamicPtReadingUtils::FLAG_WILL_BECOME_NON_TERMINAL = 0x00; + +// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0. +// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum +// value of offsets, which is 0x7FFFFF is used to represent 0 offset. +const int DynamicPtReadingUtils::DICT_OFFSET_INVALID = 0; +const int DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; + +/* static */ int DynamicPtReadingUtils::getForwardLinkPosition(const uint8_t *const buffer, + const int pos) { + int linkAddressPos = pos; + return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &linkAddressPos); +} + +/* static */ int DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); +} + +/* static */ int DynamicPtReadingUtils::getParentPtNodePos(const int parentOffset, + const int ptNodePos) { + if (parentOffset == DICT_OFFSET_INVALID) { + return NOT_A_DICT_POS; + } else if (parentOffset == DICT_OFFSET_ZERO_OFFSET) { + return ptNodePos; + } else { + return parentOffset + ptNodePos; + } +} + +/* static */ int DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const int base = *pos; + const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); + if (offset == DICT_OFFSET_INVALID) { + // The PtNode 
does not have children. + return NOT_A_DICT_POS; + } else if (offset == DICT_OFFSET_ZERO_OFFSET) { + return base; + } else { + return base + offset; + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h new file mode 100644 index 000000000..b13a075d5 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DYNAMIC_PT_READING_UTILS_H +#define LATINIME_DYNAMIC_PT_READING_UTILS_H + +#include <cstdint> + +#include "defines.h" + +namespace latinime { + +class DynamicPtReadingUtils { + public: + typedef uint8_t NodeFlags; + + static const int DICT_OFFSET_INVALID; + static const int DICT_OFFSET_ZERO_OFFSET; + + static int getForwardLinkPosition(const uint8_t *const buffer, const int pos); + + static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) { + return forwardLinkAddress != 0; + } + + static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + static int getParentPtNodePos(const int parentOffset, const int ptNodePos); + + static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + /** + * Node Flags + */ + static AK_FORCE_INLINE bool isMoved(const NodeFlags flags) { + return FLAG_IS_MOVED == (MASK_MOVED & flags); + } + + static AK_FORCE_INLINE bool isDeleted(const NodeFlags flags) { + return FLAG_IS_DELETED == (MASK_MOVED & flags); + } + + static AK_FORCE_INLINE bool willBecomeNonTerminal(const NodeFlags flags) { + return FLAG_WILL_BECOME_NON_TERMINAL == (MASK_MOVED & flags); + } + + static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags, + const bool isMoved, const bool isDeleted, const bool willBecomeNonTerminal) { + NodeFlags flags = originalFlags; + flags = willBecomeNonTerminal ? + ((flags & (~MASK_MOVED)) | FLAG_WILL_BECOME_NON_TERMINAL) : flags; + flags = isMoved ? ((flags & (~MASK_MOVED)) | FLAG_IS_MOVED) : flags; + flags = isDeleted ? ((flags & (~MASK_MOVED)) | FLAG_IS_DELETED) : flags; + flags = (!isMoved && !isDeleted && !willBecomeNonTerminal) ?
+ ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags; + return flags; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtReadingUtils); + + static const NodeFlags MASK_MOVED; + static const NodeFlags FLAG_IS_NOT_MOVED; + static const NodeFlags FLAG_IS_MOVED; + static const NodeFlags FLAG_IS_DELETED; + static const NodeFlags FLAG_WILL_BECOME_NON_TERMINAL; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_READING_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp new file mode 100644 index 000000000..ccad345c8 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp @@ -0,0 +1,299 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" + +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper, + const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, + bool *const outAddedNewUnigram) { + int parentPos = NOT_A_DICT_POS; + while (!readingHelper->isEnd()) { + const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(ptNodeParams); + continue; + } + // Check following merged node code points. + const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size(); + for (size_t j = 1; j < nodeCodePointCount; ++j) { + const size_t nextIndex = matchedCodePointCount + j; + if (nextIndex >= wordCodePoints.size() + || !readingHelper->isMatchedCodePoint(ptNodeParams, j, + wordCodePoints[matchedCodePointCount + j])) { + *outAddedNewUnigram = true; + return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, + wordCodePoints.skip(matchedCodePointCount)); + } + } + // All characters are matched. 
+ if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) { + return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); + } + if (!ptNodeParams.hasChildren()) { + *outAddedNewUnigram = true; + return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, + wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams))); + } + // Advance to the children nodes. + parentPos = ptNodeParams.getHeadPos(); + readingHelper->readChildNode(ptNodeParams); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + int pos = readingHelper->getPosOfLastForwardLinkField(); + *outAddedNewUnigram = true; + return createAndInsertNodeIntoPtNodeArray(parentPos, + wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty, + &pos); +} + +bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, + const int wordPos, const NgramProperty *const ngramProperty, + bool *const outAddedNewEntry) { + if (prevWordsPtNodePos.empty()) { + return false; + } + ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); + int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { + prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( + prevWordsPtNodePos[i]).getTerminalId(); + } + const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); + const int wordId = + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); + return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry); +} + +bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, + const int wordPos) { + if (prevWordsPtNodePos.empty()) { + return false; + } + ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); + int 
prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { + prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( + prevWordsPtNodePos[i]).getTerminalId(); + } + const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); + const int wordId = + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); + return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); +} + +bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, + const CodePointArrayView targetCodePoints, const int shortcutProbability) { + const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); + return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(), + targetCodePoints.size(), shortcutProbability); +} + +bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, + const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, forwardLinkFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty); +} + +bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { + if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { + // Overwrites the probability. + *outAddedNewUnigram = false; + return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); + } else { + // Make the node terminal and write the probability. 
+ *outAddedNewUnigram = true; + const int movedPos = mBuffer->getTailPosition(); + int writingPos = movedPos; + const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, originalPtNodeParams->getParentPos(), + originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { + return false; + } + if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { + return false; + } + } + return true; +} + +bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( + const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, + const CodePointArrayView codePoints) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, + unigramProperty); +} + +bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( + const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints, + const UnigramProperty *const unigramProperty) { + int writingPos = mBuffer->getTailPosition(); + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + 1 /* arraySize */, &writingPos)) { + return false; + } + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, parentPtNodePos, ptNodeCodePoints, + unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { + return false; + } + if 
(!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + return true; +} + +// Returns whether the dictionary update succeeded. +bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( + const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount, + const UnigramProperty *const unigramProperty, + const CodePointArrayView newPtNodeCodePoints) { + // When addsExtraChild is true, split the reallocating PtNode and add a new child. + // Reallocating PtNode: abcde, newNode: abcxy. + // abc (1st, not terminal) __ de (2nd) + // \_ xy (extra child, terminal) + // Otherwise, this method makes the 1st part terminal and writes the information in unigramProperty. + // Reallocating PtNode: abcde, newNode: abc. + // abc (1st, terminal) __ de (2nd) + const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount; + const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); + int writingPos = firstPartOfReallocatedPtNodePos; + // Write the 1st part of the reallocating node. The children position will be updated later + // with the actual children position.
+ const CodePointArrayView firstPtNodeCodePoints = + reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount); + if (addsExtraChild) { + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */, + reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, + NOT_A_PROBABILITY)); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { + return false; + } + } else { + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), + firstPtNodeCodePoints, unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, + unigramProperty, &writingPos)) { + return false; + } + } + const int actualChildrenPos = writingPos; + // Create new children PtNode array. + const size_t newPtNodeCount = addsExtraChild ? 2 : 1; + if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + newPtNodeCount, &writingPos)) { + return false; + } + // Write the 2nd part of the reallocating node. 
+ const int secondPartOfReallocatedPtNodePos = writingPos; + const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, + reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(), + reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, + reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount), + reallocatingPtNodeParams->getProbability())); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { + return false; + } + if (addsExtraChild) { + const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, firstPartOfReallocatedPtNodePos, + newPtNodeCodePoints.skip(overlappingCodePointCount), + unigramProperty->getProbability())); + if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, + unigramProperty, &writingPos)) { + return false; + } + } + if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Update original reallocating PtNode as moved. + if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, + secondPartOfReallocatedPtNodePos)) { + return false; + } + // Load node info. Information of the 1st part will be fetched. + const PtNodeParams ptNodeParams( + mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); + // Update children position. 
+ return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); +} + +const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( + const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability); +} + +const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(flags, parentPos, codePoints, probability); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h new file mode 100644 index 000000000..e8cf98c39 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PT_UPDATING_HELPER_H +#define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class NgramProperty; +class BufferWithExtendableBuffer; +class DynamicPtReadingHelper; +class PtNodeReader; +class PtNodeWriter; +class UnigramProperty; + +class DynamicPtUpdatingHelper { + public: + DynamicPtUpdatingHelper(BufferWithExtendableBuffer *const buffer, + const PtNodeReader *const ptNodeReader, PtNodeWriter *const ptNodeWriter) + : mBuffer(buffer), mPtNodeReader(ptNodeReader), mPtNodeWriter(ptNodeWriter) {} + + ~DynamicPtUpdatingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, + const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, + bool *const outAddedNewUnigram); + + // TODO: Remove after stopping supporting v402. + // Add an n-gram entry. + bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); + + // TODO: Remove after stopping supporting v402. + // Remove an n-gram entry. + bool removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos); + + // Add a shortcut target. 
+ bool addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints, + const int shortcutProbability); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mBuffer; + const PtNodeReader *const mPtNodeReader; + PtNodeWriter *const mPtNodeWriter; + + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, + const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos); + + bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, + const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); + + bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, + const UnigramProperty *const unigramProperty, + const CodePointArrayView remainingCodePoints); + + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, + const CodePointArrayView ptNodeCodePoints, + const UnigramProperty *const unigramProperty); + + bool reallocatePtNodeAndAddNewPtNodes(const PtNodeParams *const reallocatingPtNodeParams, + const size_t overlappingCodePointCount, const UnigramProperty *const unigramProperty, + const CodePointArrayView newPtNodeCodePoints); + + const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, + const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal, + const int parentPos, const CodePointArrayView codePoints, const int probability) const; + + const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PT_UPDATING_HELPER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp
b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp new file mode 100644 index 000000000..ea760a538 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" + +#include <cstddef> +#include <cstdint> +#include <cstdlib> + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F; +const size_t DynamicPtWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF; +const int DynamicPtWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1; +const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2; +const int DynamicPtWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; +const int DynamicPtWritingUtils::DICT_OFFSET_FIELD_SIZE = 3; +const int DynamicPtWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF; +const int DynamicPtWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF; +const int DynamicPtWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; +const int DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE = 1; + +/* static */ bool DynamicPtWritingUtils::writeEmptyDictionary( + BufferWithExtendableBuffer *const buffer, const int rootPos) { + int writingPos = rootPos; + if
(!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) { + return false; + } + return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */, + &writingPos); +} + +/* static */ bool DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, + int *const forwardLinkFieldPos) { + return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const size_t arraySize, + int *const arraySizeFieldPos) { + // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to + // simplify updating process. + // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays. + /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) { + return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE, + arraySizeFieldPos); + } else */ + if (arraySize <= MAX_PTNODE_ARRAY_SIZE) { + uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; + return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE, + arraySizeFieldPos); + } else { + AKLOGI("PtNode array size cannot be written because arraySize is too large: %zd", + arraySize); + ASSERT(false); + return false; + } +} + +/* static */ bool DynamicPtWritingUtils::writeFlagsAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, + const DynamicPtReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) { + return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos); +} + +// Note that parentOffset is offset from node's head position. 
+/* static */ bool DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos, + int *const parentPosFieldPos) { + return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writeCodePointsAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int *const codePoints, + const int codePointCount, int *const codePointFieldPos) { + if (codePointCount <= 0) { + AKLOGI("code points cannot be written because codePointCount is invalid: %d", + codePointCount); + ASSERT(false); + return false; + } + const bool hasMultipleCodePoints = codePointCount > 1; + return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount, + hasMultipleCodePoints, codePointFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int childrenPosition, + int *const childrenPositionFieldPos) { + return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos), + childrenPositionFieldPos); +} + +/* static */ bool DynamicPtWritingUtils::writeDictOffset(BufferWithExtendableBuffer *const buffer, + const int targetPos, const int basePos, int *const offsetFieldPos) { + int offset = targetPos - basePos; + if (targetPos == NOT_A_DICT_POS) { + offset = DynamicPtReadingUtils::DICT_OFFSET_INVALID; + } else if (offset == 0) { + offset = DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } + if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) { + AKLOGI("offset cannot be written because the offset is too large or too small: %d", + offset); + ASSERT(false); + return false; + } + uint32_t data = 0; + if (offset >= 0) { + data = offset; + } else { + data = abs(offset) | DICT_OFFSET_NEGATIVE_FLAG; + } + return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos); +} +} diff --git 
a/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
new file mode 100644
index 000000000..b4817af41
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_PT_WRITING_UTILS_H
+#define LATINIME_DYNAMIC_PT_WRITING_UTILS_H
+
+#include <cstddef>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers that write PtNode array and PtNode fields to a BufferWithExtendableBuffer.
+class DynamicPtWritingUtils {
+ public:
+    static const int NODE_FLAG_FIELD_SIZE;
+
+    static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos);
+
+    static bool writeForwardLinkPositionAndAdvancePosition(
+            BufferWithExtendableBuffer *const buffer, const int forwardLinkPos,
+            int *const forwardLinkFieldPos);
+
+    static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const size_t arraySize, int *const arraySizeFieldPos);
+
+    // Writes nodeFlags at nodeFlagsFieldPos. The position is taken by value, so the caller's
+    // own position variable is not advanced.
+    static bool writeFlags(BufferWithExtendableBuffer *const buffer,
+            const DynamicPtReadingUtils::NodeFlags nodeFlags,
+            const int nodeFlagsFieldPos) {
+        int writingPos = nodeFlagsFieldPos;
+        return writeFlagsAndAdvancePosition(buffer, nodeFlags, &writingPos);
+    }
+
+    static bool writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const DynamicPtReadingUtils::NodeFlags nodeFlags,
+            int *const nodeFlagsFieldPos);
+
+    static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int parentPosition, const int basePos, int *const parentPosFieldPos);
+
+    static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int *const codePoints, const int codePointCount, int *const codePointFieldPos);
+
+    static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer,
+            const int childrenPosition, int *const childrenPositionFieldPos);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtWritingUtils);
+
+    static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD;
+    static const size_t MAX_PTNODE_ARRAY_SIZE;
+    static const int SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE;
+    static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE;
+    static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG;
+    static const int DICT_OFFSET_FIELD_SIZE;
+    static const int MAX_DICT_OFFSET_VALUE;
+    static const int MIN_DICT_OFFSET_VALUE;
+    static const int DICT_OFFSET_NEGATIVE_FLAG;
+
+    static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos,
+            const int basePos, int *const offsetFieldPos);
+};
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_PT_WRITING_UTILS_H */
diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
new file mode 100644
index 000000000..e2807c492
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+typedef PatriciaTrieReadingUtils PtReadingUtils;
+
+const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
+
+// Flag for single/multiple char group (set when the PtNode has multiple code points)
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
+// Flag for terminal PtNodes
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
+// Flag for shortcut targets presence
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
+// Flag for bigram presence
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
+// Flag for non-words (typically, shortcut only entries)
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
+// Flag for possibly offensive words
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
+
+/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+    if (firstByte < 0x80) {  // High bit clear: the size is this single byte.
+        return firstByte;
+    } else {  // High bit set: low 7 bits form the upper part, the next byte the lower part.
+        return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
+                buffer, pos);
+    }
+}
+
+/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+}
+
+/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
+        const int *const codePointTable, int *const pos) {
+    return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
+}
+
+// Reads the PtNode's code point(s) into outBuffer. Returns the number of read characters.
+/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
+        const NodeFlags flags, const int maxLength, const int *const codePointTable,
+        int *const outBuffer, int *const pos) {
+    int length = 0;
+    if (hasMultipleChars(flags)) {
+        length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
+                outBuffer, pos);
+    } else {
+        const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
+        if (codePoint == NOT_A_CODE_POINT) {
+            // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
+            // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
+            // when the PtNode has a single code point.
+            length = 0;
+            AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
+                    *pos - 1, codePoint, buffer[*pos - 1]);
+            ASSERT(false);
+        } else if (maxLength > 0) {
+            outBuffer[0] = codePoint;
+            length = 1;
+        }
+    }
+    return length;
+}
+
+// Returns the number of skipped characters.
+/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
+        const int maxLength, const int *const codePointTable, int *const pos) {
+    if (hasMultipleChars(flags)) {
+        return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
+    } else {
+        if (maxLength > 0) {
+            getCodePointAndAdvancePosition(buffer, codePointTable, pos);  // Result unused.
+            return 1;
+        } else {
+            return 0;
+        }
+    }
+}
+
+/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
+        int *const pos) {
+    return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+}
+
+/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
+        const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
+    const int base = *pos;  // The stored offset is added to the field's own position.
+    int offset = 0;
+    switch (MASK_CHILDREN_POSITION_TYPE & flags) {
+        case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
+            offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
+            break;
+        case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
+            offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
+            break;
+        case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
+            offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
+            break;
+        default:
+            // If we come here, it means we asked for the children of a word with
+            // no children.
+            return NOT_A_DICT_POS;
+    }
+    return base + offset;
+}
+
+/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
+        const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+        const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
+        NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
+        int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
+        int *const outBigramPos, int *const outSiblingPos) {
+    int readingPos = ptNodePos;
+    const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
+    *outFlags = flags;
+    *outCodePointCount = getCharsAndAdvancePosition(
+            dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
+    *outProbability = isTerminal(flags) ?
+            readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
+    *outChildrenPos = hasChildrenInFlags(flags) ?
+            readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
+    *outShortcutPos = NOT_A_DICT_POS;
+    if (hasShortcutTargets(flags)) {
+        *outShortcutPos = readingPos;
+        shortcutPolicy->skipAllShortcuts(&readingPos);
+    }
+    *outBigramPos = NOT_A_DICT_POS;
+    if (hasBigrams(flags)) {
+        *outBigramPos = readingPos;
+        bigramPolicy->skipAllBigrams(&readingPos);
+    }
+    *outSiblingPos = readingPos;  // Position right after everything read for this PtNode.
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/app/src/main/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h
new file mode 100644
index 000000000..6a2bf5d3c
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_PATRICIA_TRIE_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryShortcutsStructurePolicy;
+class DictionaryBigramsStructurePolicy;
+
+class PatriciaTrieReadingUtils {
+ public:
+    typedef uint8_t NodeFlags;  // One byte of per-PtNode flag bits.
+
+    static int getPtNodeArraySizeAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
+            const int *const codePointTable, int *const pos);
+
+    // Reads the PtNode's code point(s) into outBuffer. Returns the number of read characters.
+    static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
+            const int maxLength, const int *const codePointTable, int *const outBuffer,
+            int *const pos);
+
+    // Returns the number of skipped characters.
+    static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
+            const int maxLength, const int *const codePointTable, int *const pos);
+
+    static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+
+    static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer,
+            const NodeFlags flags, int *const pos);
+
+    /**
+     * Node Flags
+     */
+    static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) {
+        return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0;
+    }
+
+    static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
+        return (flags & FLAG_IS_NOT_A_WORD) != 0;
+    }
+
+    static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) {
+        return (flags & FLAG_IS_TERMINAL) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) {
+        return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) {
+        return (flags & FLAG_HAS_BIGRAMS) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) {
+        return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0;
+    }
+
+    static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) {
+        return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
+    }
+
+    static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive,
+            const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
+            const bool hasBigrams, const bool hasMultipleChars,
+            const int childrenPositionFieldSize) {
+        NodeFlags nodeFlags = 0;
+        nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags;
+        nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
+        nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
+        nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
+        nodeFlags = hasBigrams ? (nodeFlags | FLAG_HAS_BIGRAMS) : nodeFlags;
+        nodeFlags = hasMultipleChars ? (nodeFlags | FLAG_HAS_MULTIPLE_CHARS) : nodeFlags;
+        if (childrenPositionFieldSize == 1) {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_ONEBYTE;
+        } else if (childrenPositionFieldSize == 2) {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_TWOBYTES;
+        } else if (childrenPositionFieldSize == 3) {
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_THREEBYTES;
+        } else {  // Any other field size is encoded as "no children position".
+            nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_NOPOSITION;
+        }
+        return nodeFlags;
+    }
+
+    static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
+            const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+            const DictionaryBigramsStructurePolicy *const bigramPolicy,
+            const int *const codePointTable, NodeFlags *const outFlags,
+            int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
+            int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
+            int *const outSiblingPos);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
+
+    static const NodeFlags MASK_CHILDREN_POSITION_TYPE;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES;
+    static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES;
+
+    static const NodeFlags FLAG_HAS_MULTIPLE_CHARS;
+    static const NodeFlags FLAG_IS_TERMINAL;
+    static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
+    static const NodeFlags FLAG_HAS_BIGRAMS;
+    static const NodeFlags FLAG_IS_NOT_A_WORD;
+    static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE;
+};
+} // namespace latinime
+#endif /* LATINIME_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h
new file mode 100644
index 000000000..6078d8285
--- /dev/null
+++
b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PT_NODE_ARRAY_READER_H
+#define LATINIME_PT_NODE_ARRAY_READER_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Interface class used to read PtNode array information.
+class PtNodeArrayReader {
+ public:
+    virtual ~PtNodeArrayReader() {}
+
+    // Returns whether the position is valid. The PtNode count and the first PtNode position
+    virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+            int *const outPtNodeCount, int *const outFirstPtNodePos) const = 0;  // are outputs.
+
+    // Returns if the position is valid or not. NOT_A_DICT_POS is set to outNextPtNodeArrayPos when
+    // the next array doesn't exist.
+    virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+            int *const outNextPtNodeArrayPos) const = 0;
+
+ protected:
+    PtNodeArrayReader() {}  // Constructible by subclasses only.
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(PtNodeArrayReader);
+};
+} // namespace latinime
+#endif /* LATINIME_PT_NODE_ARRAY_READER_H */
diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_params.h b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_params.h
new file mode 100644
index 000000000..905deb1bc
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_params.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PT_NODE_PARAMS_H
+#define LATINIME_PT_NODE_PARAMS_H
+
+#include <cstring>
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/char_utils.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+// This class has information of a PtNode. This class is immutable.
+class PtNodeParams {
+ public:
+    // Invalid PtNode.
+    PtNodeParams() : mHeadPos(NOT_A_DICT_POS), mFlags(0), mHasMovedFlag(false),
+            mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mCodePoints(),
+            mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+            mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY),
+            mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS),
+            mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS),
+            mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {}  // isValid() is false.
+
+    PtNodeParams(const PtNodeParams& ptNodeParams)  // Copies all fields and the used code points.
+            : mHeadPos(ptNodeParams.mHeadPos), mFlags(ptNodeParams.mFlags),
+              mHasMovedFlag(ptNodeParams.mHasMovedFlag), mParentPos(ptNodeParams.mParentPos),
+              mCodePointCount(ptNodeParams.mCodePointCount), mCodePoints(),
+              mTerminalIdFieldPos(ptNodeParams.mTerminalIdFieldPos),
+              mTerminalId(ptNodeParams.mTerminalId),
+              mProbabilityFieldPos(ptNodeParams.mProbabilityFieldPos),
+              mProbability(ptNodeParams.mProbability),
+              mChildrenPosFieldPos(ptNodeParams.mChildrenPosFieldPos),
+              mChildrenPos(ptNodeParams.mChildrenPos),
+              mBigramLinkedNodePos(ptNodeParams.mBigramLinkedNodePos),
+              mShortcutPos(ptNodeParams.mShortcutPos), mBigramPos(ptNodeParams.mBigramPos),
+              mSiblingPos(ptNodeParams.mSiblingPos) {
+        memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount);
+    }
+
+    // PtNode read from version 2 dictionary.
+    PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags,
+            const int codePointCount, const int *const codePoints, const int probability,
+            const int childrenPos, const int shortcutPos, const int bigramPos,
+            const int siblingPos)
+            : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(false), mParentPos(NOT_A_DICT_POS),
+              mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS),
+              mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+              mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
+              mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(childrenPos),
+              mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(shortcutPos),
+              mBigramPos(bigramPos), mSiblingPos(siblingPos) {
+        memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);  // No bounds check here.
+    }
+
+    // PtNode with a terminal id, which is also stored as the shortcut and bigram positions.
+    PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags,
+            const int parentPos, const int codePointCount, const int *const codePoints,
+            const int terminalIdFieldPos, const int terminalId, const int probability,
+            const int childrenPosFieldPos, const int childrenPos, const int siblingPos)
+            : mHeadPos(headPos), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
+              mCodePointCount(codePointCount), mCodePoints(),
+              mTerminalIdFieldPos(terminalIdFieldPos), mTerminalId(terminalId),
+              mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
+              mChildrenPosFieldPos(childrenPosFieldPos), mChildrenPos(childrenPos),
+              mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(terminalId),
+              mBigramPos(terminalId), mSiblingPos(siblingPos) {
+        memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);  // No bounds check here.
+    }
+
+    // Construct new params by updating existing PtNode params.
+    PtNodeParams(const PtNodeParams *const ptNodeParams,
+            const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
+            const CodePointArrayView codePoints, const int probability)
+            : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true),
+              mParentPos(parentPos), mCodePointCount(codePoints.size()), mCodePoints(),
+              mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()),
+              mTerminalId(ptNodeParams->getTerminalId()),
+              mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()),
+              mProbability(probability),
+              mChildrenPosFieldPos(ptNodeParams->getChildrenPosFieldPos()),
+              mChildrenPos(ptNodeParams->getChildrenPos()),
+              mBigramLinkedNodePos(ptNodeParams->getBigramLinkedNodePos()),
+              mShortcutPos(ptNodeParams->getShortcutPos()),
+              mBigramPos(ptNodeParams->getBigramsPos()),
+              mSiblingPos(ptNodeParams->getSiblingNodePos()) {
+        memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount);
+    }
+
+    PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
+            const CodePointArrayView codePoints, const int probability)
+            : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
+              mCodePointCount(codePoints.size()), mCodePoints(),
+              mTerminalIdFieldPos(NOT_A_DICT_POS),
+              mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
+              mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
+              mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS),
+              mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS),
+              mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {
+        memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount);
+    }
+
+    AK_FORCE_INLINE bool isValid() const {
+        return mCodePointCount > 0;  // The default-constructed (invalid) PtNode has none.
+    }
+
+    // Head position of the PtNode
+    AK_FORCE_INLINE int getHeadPos() const {
+        return mHeadPos;
+    }
+
+    // Flags. isDeleted() and willBecomeNonTerminal() are only honored when mHasMovedFlag is set.
+    AK_FORCE_INLINE bool isDeleted() const {
+        return mHasMovedFlag && DynamicPtReadingUtils::isDeleted(mFlags);
+    }
+
+    AK_FORCE_INLINE bool willBecomeNonTerminal() const {
+        return mHasMovedFlag && DynamicPtReadingUtils::willBecomeNonTerminal(mFlags);
+    }
+
+    AK_FORCE_INLINE bool hasChildren() const {
+        return mChildrenPos != NOT_A_DICT_POS;
+    }
+
+    AK_FORCE_INLINE bool isTerminal() const {
+        return PatriciaTrieReadingUtils::isTerminal(mFlags);
+    }
+
+    AK_FORCE_INLINE bool isPossiblyOffensive() const {
+        return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
+    }
+
+    AK_FORCE_INLINE bool isNotAWord() const {
+        return PatriciaTrieReadingUtils::isNotAWord(mFlags);
+    }
+
+    AK_FORCE_INLINE bool hasBigrams() const {
+        return PatriciaTrieReadingUtils::hasBigrams(mFlags);
+    }
+
+    AK_FORCE_INLINE bool hasShortcutTargets() const {
+        return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
+    }
+
+    AK_FORCE_INLINE bool representsNonWordInfo() const {
+        return getCodePointCount() > 0 && !CharUtils::isInUnicodeSpace(getCodePoints()[0])
+                && isNotAWord();
+    }
+
+    AK_FORCE_INLINE bool representsBeginningOfSentence() const {
+        return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE
+                && isNotAWord();
+    }
+
+    // Parent node position
+    AK_FORCE_INLINE int getParentPos() const {
+        return mParentPos;
+    }
+
+    AK_FORCE_INLINE const CodePointArrayView getCodePointArrayView() const {
+        return CodePointArrayView(mCodePoints, mCodePointCount);
+    }
+
+    // TODO: Remove
+    // Number of code points
+    AK_FORCE_INLINE uint8_t getCodePointCount() const {
+        return mCodePointCount;
+    }
+
+    // TODO: Remove
+    AK_FORCE_INLINE const int *getCodePoints() const {
+        return mCodePoints;
+    }
+
+    // Terminal id
+    AK_FORCE_INLINE int getTerminalIdFieldPos() const {
+        return mTerminalIdFieldPos;
+    }
+
+    AK_FORCE_INLINE int getTerminalId() const {
+        return mTerminalId;
+    }
+
+    // Probability
+    AK_FORCE_INLINE int getProbabilityFieldPos() const {
+        return mProbabilityFieldPos;
+    }
+
+    AK_FORCE_INLINE int getProbability() const {
+        return mProbability;
+    }
+
+    // Children PtNode array position
+    AK_FORCE_INLINE int getChildrenPosFieldPos() const {
+        return mChildrenPosFieldPos;
+    }
+
+    AK_FORCE_INLINE int getChildrenPos() const {
+        return mChildrenPos;
+    }
+
+    // Bigram linked node position.
+    AK_FORCE_INLINE int getBigramLinkedNodePos() const {
+        return mBigramLinkedNodePos;
+    }
+
+    // Shortcut list position
+    AK_FORCE_INLINE int getShortcutPos() const {
+        return mShortcutPos;
+    }
+
+    // Bigrams position
+    AK_FORCE_INLINE int getBigramsPos() const {
+        return mBigramPos;
+    }
+
+    // Sibling node position
+    AK_FORCE_INLINE int getSiblingNodePos() const {
+        return mSiblingPos;
+    }
+
+ private:
+    // This class has a public copy constructor to be used as a return value.
+    DISALLOW_ASSIGNMENT_OPERATOR(PtNodeParams);
+
+    const int mHeadPos;
+    const PatriciaTrieReadingUtils::NodeFlags mFlags;
+    const bool mHasMovedFlag;
+    const int mParentPos;
+    const uint8_t mCodePointCount;
+    int mCodePoints[MAX_WORD_LENGTH];  // Constructors fill the first mCodePointCount entries.
+    const int mTerminalIdFieldPos;
+    const int mTerminalId;
+    const int mProbabilityFieldPos;
+    const int mProbability;
+    const int mChildrenPosFieldPos;
+    const int mChildrenPos;
+    const int mBigramLinkedNodePos;
+    const int mShortcutPos;
+    const int mBigramPos;
+    const int mSiblingPos;
+};
+} // namespace latinime
+#endif /* LATINIME_PT_NODE_PARAMS_H */
diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_reader.h b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_reader.h
new file mode 100644
index 000000000..15da19e0b
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_reader.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PT_NODE_READER_H +#define LATINIME_PT_NODE_READER_H + +#include "defines.h" + +#include "dictionary/structure/pt_common/pt_node_params.h" + +namespace latinime { + +// Interface class used to read PtNode information. +class PtNodeReader { + public: + virtual ~PtNodeReader() {} + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos( + const int ptNodePos) const = 0; + + protected: + PtNodeReader() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeReader); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_READER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_writer.h b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_writer.h new file mode 100644 index 000000000..e6cad25aa --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/pt_node_writer.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_PT_NODE_WRITER_H +#define LATINIME_PT_NODE_WRITER_H + +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class NgramProperty; +class UnigramProperty; + +// Interface class used to write PtNode information. +class PtNodeWriter { + public: + typedef std::unordered_map PtNodeArrayPositionRelocationMap; + typedef std::unordered_map PtNodePositionRelocationMap; + struct DictPositionRelocationMap { + public: + DictPositionRelocationMap() + : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} + + PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; + PtNodePositionRelocationMap mPtNodePositionRelocationMap; + + private: + DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); + }; + + virtual ~PtNodeWriter() {} + + virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; + + virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int movedPos, const int bigramLinkedNodePos) = 0; + + virtual bool markPtNodeAsWillBecomeNonTerminal( + const PtNodeParams *const toBeUpdatedPtNodeParams) = 0; + + virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams, + const UnigramProperty *const unigramProperty) = 0; + + virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( + const PtNodeParams *const toBeUpdatedPtNodeParams, + bool *const outNeedsToKeepPtNode) = 0; + + virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams, + const int newChildrenPosition) = 0; + + virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + int *const ptNodeWritingPos) = 0; + + virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams, + const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0; + + virtual bool 
addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0; + + virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0; + + virtual bool updateAllBigramEntriesAndDeleteUselessEntries( + const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) = 0; + + virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams, + const DictPositionRelocationMap *const dictPositionRelocationMap, + int *const outBigramEntryCount) = 0; + + virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams, + const int *const targetCodePoints, const int targetCodePointCount, + const int shortcutProbability) = 0; + + protected: + PtNodeWriter() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(PtNodeWriter); +}; +} // namespace latinime +#endif /* LATINIME_PT_NODE_WRITER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/app/src/main/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp new file mode 100644 index 000000000..14428edd4 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// Flag for presence of more attributes +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; +// The numeric value of the shortcut probability that means 'whitelist'. +const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; + +/* static */ ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); +} + +/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer( + const ReadOnlyByteArrayView buffer, int *const pos) { + // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself. + return ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos) + - SHORTCUT_LIST_SIZE_FIELD_SIZE; +} + +/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer, + const int maxLength, int *const outWord, int *const pos) { + // TODO: Use codePointTable for shortcuts. 
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, + nullptr /* codePointTable */, outWord, pos); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h b/app/src/main/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h new file mode 100644 index 000000000..71cb8cc2c --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H +#define LATINIME_SHORTCUT_LIST_READING_UTILS_H + +#include + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class ShortcutListReadingUtils { + public: + typedef uint8_t ShortcutFlags; + + static ShortcutFlags getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos); + + static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // This method returns the size of the shortcut list region excluding the shortcut list size + // field at the beginning. 
+ static int getShortcutListSizeAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos); + + static AK_FORCE_INLINE int getShortcutListSizeFieldSize() { + return SHORTCUT_LIST_SIZE_FIELD_SIZE; + } + + static AK_FORCE_INLINE void skipShortcuts(const ReadOnlyByteArrayView buffer, int *const pos) { + const int shortcutListSize = getShortcutListSizeAndForwardPointer(buffer, pos); + *pos += shortcutListSize; + } + + static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) { + return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; + } + + static int readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength, + int *const outWord, int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils); + + static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY; + static const int SHORTCUT_LIST_SIZE_FIELD_SIZE; + static const int WHITELIST_SHORTCUT_PROBABILITY; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H diff --git a/app/src/main/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h b/app/src/main/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h new file mode 100644 index 000000000..25081fa04 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_POLICY_H +#define LATINIME_BIGRAM_LIST_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + BigramListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} + + ~BigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const pos) const { + BigramListReadWriteUtils::BigramFlags flags; + if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBuffer, &flags, + outBigramPos, pos)) { + AKLOGE("Cannot read bigram entry. bufSize: %zd, pos: %d. ", mBuffer.size(), *pos); + *outProbability = NOT_A_PROBABILITY; + *outHasNext = false; + return; + } + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); + } + + bool skipAllBigrams(int *const pos) const { + return BigramListReadWriteUtils::skipExistingBigrams(mBuffer, pos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); + + const ReadOnlyByteArrayView mBuffer; +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_POLICY_H diff --git a/app/src/main/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp b/app/src/main/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp new file mode 100644 index 000000000..4e8b96b08 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp @@ -0,0 +1,526 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v2/patricia_trie_policy.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/char_utils.h" + +namespace latinime { + +void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + int nextPos = dicNode->getChildrenPtNodeArrayPos(); + if (!isValidPos(nextPos)) { + AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd", + nextPos, mBuffer.size()); + mIsCorrupted = true; + ASSERT(false); + return; + } + const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mBuffer.data(), &nextPos); + for (int i = 0; i < childCount; i++) { + if (!isValidPos(nextPos)) { + AKLOGE("Child PtNode position is invalid. 
pos: %d, dict size: %zd, childCount: %d / %d", + nextPos, mBuffer.size(), i, childCount); + mIsCorrupted = true; + ASSERT(false); + return; + } + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); + } +} + +int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount, + outCodePoints, nullptr /* outUnigramProbability */); +} +// This retrieves code points and the probability of the word by its id. +// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, +// it is possible to check for this with advantageous complexity. For each PtNode array, we search +// for PtNodes with children and compare the children position with the position we look for. +// When we shoot the position we look for, it means the word we look for is in the children +// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a +// PtNode array with the last PtNode's children position still less than what we are searching for, +// we must descend the last PtNode's children (for example, if the word we are searching for starts +// with a z, it's the last PtNode of the root array, so all children addresses will be smaller +// than the position we look for, and we have to descend the z PtNode). +/* Parameters : + * wordId: Id of the word we are searching for. + * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramProbability: a pointer to an int to write the probability into. + * Return value : the code point count, of 0 if the word was not found. 
+ */
+// TODO: Split this function to be more readable
+// NOTE(review): writes to outCodePoints use ++wordPos, which accumulates across depth levels,
+// while charCount is reset to maxCodePointCount for every PtNode. Nothing in this function
+// bounds wordPos by maxCodePointCount, so the function relies on well-formed dictionary data
+// and a large enough output buffer - confirm against callers.
+int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
+        const int wordId, const int maxCodePointCount, int *const outCodePoints,
+        int *const outUnigramProbability) const {
+    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+    int pos = getRootPosition();
+    int wordPos = 0;
+    const int *const codePointTable = mHeaderPolicy.getCodePointTable();
+    if (outUnigramProbability) {
+        *outUnigramProbability = NOT_A_PROBABILITY;
+    }
+    // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
+    // only traverse PtNodes that are actually a part of the terminal we are searching, so each
+    // time we enter this loop we are one depth level further than last time.
+    // The only reason we count PtNodes is because we want to reduce the probability of infinite
+    // looping in case there is a bug. Since we know there is an upper bound to the depth we are
+    // supposed to traverse, it does not hurt to count iterations.
+    for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
+        int lastCandidatePtNodePos = 0;
+        // Let's loop through PtNodes in this PtNode array searching for either the terminal
+        // or one of its ascendants.
+        if (!isValidPos(pos)) {
+            AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
+                    pos, mBuffer.size());
+            mIsCorrupted = true;
+            ASSERT(false);
+            return 0;
+        }
+        for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+                mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
+            const int startPos = pos;
+            if (!isValidPos(pos)) {
+                AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
+                mIsCorrupted = true;
+                ASSERT(false);
+                return 0;
+            }
+            const PatriciaTrieReadingUtils::NodeFlags flags =
+                    PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
+            const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+                    mBuffer.data(), codePointTable, &pos);
+            if (ptNodePos == startPos) {
+                // We found the position. Copy the rest of the code points in the buffer and return
+                // the length.
+                outCodePoints[wordPos] = character;
+                if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
+                    int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+                            mBuffer.data(), codePointTable, &pos);
+                    // We count code points in order to avoid infinite loops if the file is broken
+                    // or if there is some other bug
+                    int charCount = maxCodePointCount;
+                    while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
+                        outCodePoints[++wordPos] = nextChar;
+                        nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+                                mBuffer.data(), codePointTable, &pos);
+                    }
+                }
+                if (outUnigramProbability) {
+                    *outUnigramProbability =
+                            PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
+                                    mBuffer.data(), &pos);
+                }
+                return ++wordPos;
+            }
+            // We need to skip past this PtNode, so skip any remaining code points after the
+            // first and possibly the probability.
+            if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
+                PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
+                        codePointTable, &pos);
+            }
+            if (PatriciaTrieReadingUtils::isTerminal(flags)) {
+                PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
+            }
+            // The fact that this PtNode has children is very important. Since we already know
+            // that this PtNode does not match, if it has no children we know it is irrelevant
+            // to what we are searching for.
+            const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
+            // We will write in `found' whether we have passed the children position we are
+            // searching for. For example if we search for "beer", the children of b are less
+            // than the address we are searching for and the children of c are greater. When we
+            // come here for c, we realize this is too big, and that we should descend b.
+            bool found;
+            if (hasChildren) {
+                // Peek at the children position on a copy so that pos itself stays just before
+                // the children position field; that field is skipped further down.
+                int currentPos = pos;
+                // Here comes the tricky part. First, read the children position.
+                const int childrenPos = PatriciaTrieReadingUtils
+                        ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
+                                &currentPos);
+                if (childrenPos > ptNodePos) {
+                    // If the children pos is greater than the position, it means the previous
+                    // PtNode, which position is stored in lastCandidatePtNodePos, was the right
+                    // one.
+                    found = true;
+                } else if (1 >= ptNodeCount) {
+                    // However if we are on the LAST PtNode of this array, and we have NOT shot the
+                    // position we should descend THIS PtNode. So we trick the
+                    // lastCandidatePtNodePos so that we will descend this PtNode, not the previous
+                    // one.
+                    lastCandidatePtNodePos = startPos;
+                    found = true;
+                } else {
+                    // Else, we should continue looking.
+                    found = false;
+                }
+            } else {
+                // Even if we don't have children here, we could still be on the last PtNode of
+                // this array. If this is the case, we should descend the last PtNode that had
+                // children, and their position is already in lastCandidatePtNodePos.
+                found = (1 >= ptNodeCount);
+            }
+
+            if (found) {
+                // Okay, we found the PtNode we should descend. Its position is in
+                // the lastCandidatePtNodePos variable, so we just re-read it.
+                if (0 != lastCandidatePtNodePos) {
+                    const PatriciaTrieReadingUtils::NodeFlags lastFlags =
+                            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
+                                    mBuffer.data(), &lastCandidatePtNodePos);
+                    const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+                            mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
+                    // We copy all the characters in this PtNode to the buffer
+                    outCodePoints[wordPos] = lastChar;
+                    if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
+                        int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+                                mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
+                        int charCount = maxCodePointCount;
+                        while (-1 != nextChar && --charCount > 0) {
+                            outCodePoints[++wordPos] = nextChar;
+                            nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
+                                    mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
+                        }
+                    }
+                    ++wordPos;
+                    // Now we only need to branch to the children address. Skip the probability if
+                    // it's there, read pos, and break to resume the search at pos.
+                    if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
+                        PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
+                                &lastCandidatePtNodePos);
+                    }
+                    pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+                            mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
+                    break;
+                } else {
+                    // Here is a little tricky part: we come here if we found out that all children
+                    // addresses in this PtNode are bigger than the address we are searching for.
+                    // Should we conclude the word is not in the dictionary? No! It could still be
+                    // one of the remaining PtNodes in this array, so we have to keep looking in
+                    // this array until we find it (or we realize it's not there either, in which
+                    // case it's actually not in the dictionary). Pass the end of this PtNode,
+                    // ready to start the next one.
+                    if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
+                        PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+                                mBuffer.data(), flags, &pos);
+                    }
+                    if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
+                        mShortcutListPolicy.skipAllShortcuts(&pos);
+                    }
+                    if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
+                        if (!mBigramListPolicy.skipAllBigrams(&pos)) {
+                            AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
+                                    pos);
+                            mIsCorrupted = true;
+                            ASSERT(false);
+                            return 0;
+                        }
+                    }
+                }
+            } else {
+                // If we did not find it, we should record the last children address for the next
+                // iteration.
+                if (hasChildren) lastCandidatePtNodePos = startPos;
+                // Now skip the end of this PtNode (children pos and the attributes if any) so that
+                // our pos is after the end of this PtNode, at the start of the next one.
+                if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
+                    PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
+                            mBuffer.data(), flags, &pos);
+                }
+                if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
+                    mShortcutListPolicy.skipAllShortcuts(&pos);
+                }
+                if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
+                    if (!mBigramListPolicy.skipAllBigrams(&pos)) {
+                        AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
+                        mIsCorrupted = true;
+                        ASSERT(false);
+                        return 0;
+                    }
+                }
+            }
+
+        }
+    }
+    // If we have looked through all the PtNodes and found no match, the ptNodePos is
+    // not the position of a terminal in this dictionary.
+    return 0;
+}
+
+// This function gets the position of the terminal PtNode of the exact matching word in the
+// dictionary. If no match is found, it returns NOT_A_WORD_ID.
+int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+        const bool forceLowerCaseSearch) const {
+    DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
+    readingHelper.initWithPtNodeArrayPos(getRootPosition());
+    const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+            wordCodePoints.size(), forceLowerCaseSearch);
+    if (readingHelper.isError()) {
+        // A reading error flags the dictionary as corrupted; the position returned by the
+        // helper is still converted and returned below.
+        mIsCorrupted = true;
+        AKLOGE("Dictionary reading error in getWordId().");
+    }
+    return getWordIdFromTerminalPtNodePos(ptNodePos);
+}
+
+// Returns the attributes of wordId. The probability comes from multiBigramMap when one is
+// given; otherwise from the bigram entry for prevWordIds when there is one, and from the
+// backed-off unigram probability as the last resort. NOT_A_WORD_ID yields default attributes.
+const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
+        const WordIdArrayView prevWordIds, const int wordId,
+        MultiBigramMap *const multiBigramMap) const {
+    if (wordId == NOT_A_WORD_ID) {
+        return WordAttributes();
+    }
+    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+    const PtNodeParams ptNodeParams =
+            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+    if (multiBigramMap) {
+        const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
+                prevWordIds, wordId, ptNodeParams.getProbability());
+        return getWordAttributes(probability, ptNodeParams);
+    }
+    if (!prevWordIds.empty()) {
+        const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
+        if (bigramProbability != NOT_A_PROBABILITY) {
+            return getWordAttributes(bigramProbability, ptNodeParams);
+        }
+    }
+    return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
+            ptNodeParams);
+}
+
+// Builds WordAttributes from a probability and the not-a-word / possibly-offensive flags of
+// the PtNode. isBlacklisted is always false here.
+const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
+        const PtNodeParams &ptNodeParams) const {
+    return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
+            ptNodeParams.isPossiblyOffensive());
+}
+
+// Combines a unigram and a bigram probability. Returns NOT_A_PROBABILITY when there is no
+// unigram probability, and the backed-off unigram probability when there is no bigram
+// probability.
+int PatriciaTriePolicy::getProbability(const int unigramProbability,
+        const int bigramProbability) const {
+    // Due to space constraints, the probability for bigrams is approximate - the lower the unigram
+    // probability, the worse the precision. The theoretical maximum error in resulting probability
+    // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means
+    // that sometimes, we'll see some bigrams inverted here, but it can't get too bad.
+    if (unigramProbability == NOT_A_PROBABILITY) {
+        return NOT_A_PROBABILITY;
+    } else if (bigramProbability == NOT_A_PROBABILITY) {
+        return ProbabilityUtils::backoff(unigramProbability);
+    } else {
+        return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
+                bigramProbability);
+    }
+}
+
+// Returns the probability of wordId. With a non-empty prevWordIds only the bigram list of
+// prevWordIds[0] is consulted, and NOT_A_PROBABILITY is returned when it holds no usable
+// entry for wordId.
+int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+        const int wordId) const {
+    if (wordId == NOT_A_WORD_ID) {
+        return NOT_A_PROBABILITY;
+    }
+    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+    const PtNodeParams ptNodeParams =
+            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+    if (ptNodeParams.isNotAWord()) {
+        // If this is not a word, it should behave as having no probability outside of the
+        // suggestion process (where it should be used for shortcuts).
+        return NOT_A_PROBABILITY;
+    }
+    if (!prevWordIds.empty()) {
+        const int bigramsPosition = getBigramsPositionOfPtNode(
+                getTerminalPtNodePosFromWordId(prevWordIds[0]));
+        BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
+        while (bigramsIt.hasNext()) {
+            bigramsIt.next();
+            if (bigramsIt.getBigramPos() == ptNodePos
+                    && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
+                return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
+            }
+        }
+        return NOT_A_PROBABILITY;
+    }
+    return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+}
+
+// Reports every bigram entry of prevWordIds[0] to listener. Does nothing when prevWordIds is
+// empty.
+void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
+        NgramListener *const listener) const {
+    if (prevWordIds.empty()) {
+        return;
+    }
+    const int bigramsPosition = getBigramsPositionOfPtNode(
+            getTerminalPtNodePosFromWordId(prevWordIds[0]));
+    BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
+    while (bigramsIt.hasNext()) {
+        bigramsIt.next();
+        listener->onVisitEntry(bigramsIt.getProbability(),
+                getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
+    }
+}
+
+// Returns an iterator positioned on the shortcut list of wordId.
+BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
+    const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
+    return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos);
+}
+
+// Shortcut list position of the PtNode at ptNodePos; NOT_A_DICT_POS is passed through.
+int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
+    if (ptNodePos == NOT_A_DICT_POS) {
+        return NOT_A_DICT_POS;
+    }
+    return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
+}
+
+// Bigram list position of the PtNode at ptNodePos; NOT_A_DICT_POS is passed through.
+int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
+    if (ptNodePos == NOT_A_DICT_POS) {
+        return NOT_A_DICT_POS;
+    }
+    return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos();
+}
+
+// Reads the PtNode at ptNodePos, pushes it to childDicNodes as a leaving child when its first
+// code point is in the Unicode space, and returns the sibling position reported by the reader.
+int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
+        const int ptNodePos, DicNodeVector *childDicNodes) const {
+    PatriciaTrieReadingUtils::NodeFlags flags;
+    int mergedNodeCodePointCount = 0;
+    int mergedNodeCodePoints[MAX_WORD_LENGTH];
+    int probability = NOT_A_PROBABILITY;
+    int childrenPos = NOT_A_DICT_POS;
+    int shortcutPos = NOT_A_DICT_POS;
+    int bigramPos = NOT_A_DICT_POS;
+    int siblingPos = NOT_A_DICT_POS;
+    const int *const codePointTable = mHeaderPolicy.getCodePointTable();
+    PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
+            &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
+            mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
+            &siblingPos);
+    // Skip PtNodes that don't start with a Unicode code point because they represent non-word
+    // information.
+    if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
+        // In this format the word id of a terminal is its PtNode position.
+        const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
+        childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
+                CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
+    }
+    return siblingPos;
+}
+
+// Collects the unigram, bigram and shortcut information of the exactly matching word. Returns
+// a default-constructed WordProperty (after logging) when the word is not in the dictionary.
+const WordProperty PatriciaTriePolicy::getWordProperty(
+        const CodePointArrayView wordCodePoints) const {
+    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+    if (wordId == NOT_A_WORD_ID) {
+        AKLOGE("getWordProperty was called for invalid word.");
+        return WordProperty();
+    }
+    const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+    const PtNodeParams ptNodeParams =
+            mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+    // Fetch bigram information.
+    std::vector<NgramProperty> ngrams;
+    const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
+    int bigramWord1CodePoints[MAX_WORD_LENGTH];
+    BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
+    while (bigramsIt.hasNext()) {
+        // Fetch the next bigram information and forward the iterator.
+        bigramsIt.next();
+        // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
+        if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
+            int word1Probability = NOT_A_PROBABILITY;
+            const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
+                    getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
+                    bigramWord1CodePoints, &word1Probability);
+            const int probability = getProbability(word1Probability, bigramsIt.getProbability());
+            ngrams.emplace_back(
+                    NgramContext(wordCodePoints.data(), wordCodePoints.size(),
+                            ptNodeParams.representsBeginningOfSentence()),
+                    CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
+                    probability, HistoricalInfo());
+        }
+    }
+    // Fetch shortcut information.
+    // NOTE(review): the element type was missing from this declaration;
+    // UnigramProperty::ShortcutProperty is assumed from the (code points, probability)
+    // emplace_back below - confirm against the UnigramProperty declaration.
+    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+    int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
+    if (shortcutPos != NOT_A_DICT_POS) {
+        int shortcutTargetCodePoints[MAX_WORD_LENGTH];
+        // Only called to move shortcutPos past the size field; the entries are walked through
+        // their has-next flags instead of the list size.
+        ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
+        bool hasNext = true;
+        while (hasNext) {
+            const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
+                    ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
+            hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
+            const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
+                    mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
+            const int shortcutProbability =
+                    ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
+            shortcuts.emplace_back(
+                    CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
+                    shortcutProbability);
+        }
+    }
+    const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
+            ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
+            ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
+    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
+}
+
+int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; + if (token == 0) { + // Start iterating the dictionary. + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int terminalPtNodePositionsVectorSize = + static_cast(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + *outCodePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { + return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; +} + +int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { + return wordId == NOT_A_WORD_ID ? 
NOT_A_DICT_POS : wordId; +} + +bool PatriciaTriePolicy::isValidPos(const int pos) const { + return pos >= 0 && pos < static_cast(mBuffer.size()); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v2/patricia_trie_policy.h b/app/src/main/jni/src/dictionary/structure/v2/patricia_trie_policy.h new file mode 100644 index 000000000..8edfa7d10 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/patricia_trie_policy.h @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PATRICIA_TRIE_POLICY_H +#define LATINIME_PATRICIA_TRIE_POLICY_H + +#include +#include + +#include "defines.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/v2/bigram/bigram_list_policy.h" +#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h" +#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" +#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" +#include "utils/byte_array_view.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +// Word id = Position of a PtNode that represents the word. +// Max supported n-gram is bigram. 
+class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) + : mMmappedBuffer(std::move(mmappedBuffer)), + mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), + FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())), + mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())), + mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer), + mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy, + mHeaderPolicy.getCodePointTable()), + mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(), + mIsCorrupted(false) {} + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; + + int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; + + const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; + + void iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const; + + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + bool addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + // This method should not be called for non-updatable dictionary. 
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + + bool removeUnigramEntry(const CodePointArrayView wordCodePoints) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + + bool addNgramEntry(const NgramProperty *const ngramProperty) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + + bool flush(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return false; + } + + bool flushWithGC(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + + bool needsToRunGC(const bool mindsBlockByGC) const { + // This method should not be called for non-updatable dictionary. 
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength) { + // getProperty is not supported for this class. + if (maxResultLength > 0) { + outResult[0] = '\0'; + } + } + + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; + + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); + + bool isCorrupted() const { + return mIsCorrupted; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); + + const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; + const HeaderPolicy mHeaderPolicy; + const ReadOnlyByteArrayView mBuffer; + const BigramListPolicy mBigramListPolicy; + const ShortcutListPolicy mShortcutListPolicy; + const Ver2ParticiaTrieNodeReader mPtNodeReader; + const Ver2PtNodeArrayReader mPtNodeArrayReader; + std::vector mTerminalPtNodePositionsForIteratingWords; + mutable bool mIsCorrupted; + + int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; + int getBigramsPositionOfPtNode(const int ptNodePos) const; + int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, + DicNodeVector *const childDicNodes) const; + int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; + int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const; + bool isValidPos(const int pos) const; +}; +} // namespace latinime +#endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/app/src/main/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h 
b/app/src/main/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h new file mode 100644 index 000000000..995b1ed01 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_POLICY_H +#define LATINIME_SHORTCUT_LIST_POLICY_H + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + explicit ShortcutListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} + + ~ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + if (pos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int listPos = pos; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &listPos); + return listPos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + const ShortcutListReadingUtils::ShortcutFlags flags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, pos); + if (outHasNext) { + *outHasNext = 
ShortcutListReadingUtils::hasNext(flags); + } + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); + } + if (outCodePoint) { + *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( + mBuffer, maxCodePointCount, outCodePoint, pos); + } + } + + void skipAllShortcuts(int *const pos) const { + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(mBuffer, pos); + *pos += shortcutListSize; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy); + + const ReadOnlyByteArrayView mBuffer; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_POLICY_H diff --git a/app/src/main/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/app/src/main/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp new file mode 100644 index 000000000..cbb8ead81 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" + +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" + +namespace latinime { + +const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos( + const int ptNodePos) const { + if (ptNodePos < 0 || ptNodePos >= static_cast(mBuffer.size())) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %zd", + ptNodePos, mBuffer.size()); + ASSERT(false); + return PtNodeParams(); + } + PatriciaTrieReadingUtils::NodeFlags flags; + int mergedNodeCodePointCount = 0; + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + int probability = NOT_A_PROBABILITY; + int childrenPos = NOT_A_DICT_POS; + int shortcutPos = NOT_A_DICT_POS; + int bigramPos = NOT_A_DICT_POS; + int siblingPos = NOT_A_DICT_POS; + PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortcutPolicy, + mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, + &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + if (mergedNodeCodePointCount <= 0) { + AKLOGE("Empty PtNode is not allowed. 
Code point count: %d", mergedNodeCodePointCount); + ASSERT(false); + return PtNodeParams(); + } + return PtNodeParams(ptNodePos, flags, mergedNodeCodePointCount, mergedNodeCodePoints, + probability, childrenPos, shortcutPos, bigramPos, siblingPos); +} + +} diff --git a/app/src/main/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/app/src/main/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h new file mode 100644 index 000000000..dc87c7c68 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H + +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +class Ver2ParticiaTrieNodeReader : public PtNodeReader { + public: + Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer, + const DictionaryBigramsStructurePolicy *const bigramPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const int *const codePointTable) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), + mCodePointTable(codePointTable) {} + + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader); + + const ReadOnlyByteArrayView mBuffer; + const DictionaryBigramsStructurePolicy *const mBigramPolicy; + const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; + const int *const mCodePointTable; +}; +} // namespace latinime +#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/app/src/main/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp new file mode 100644 index 000000000..8b9b02df1 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" + +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" + +namespace latinime { + +bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= static_cast(mBuffer.size())) { + // Reading invalid position because of a bug or a broken dictionary. + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %zd", + ptNodeArrayPos, mBuffer.size()); + ASSERT(false); + return false; + } + int readingPos = ptNodeArrayPos; + const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mBuffer.data(), &readingPos); + *outPtNodeCount = ptNodeCountInArray; + *outFirstPtNodePos = readingPos; + return true; +} + +bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const { + if (forwordLinkPos < 0 || forwordLinkPos >= static_cast(mBuffer.size())) { + // Reading invalid position because of bug or broken dictionary. + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %zd", + forwordLinkPos, mBuffer.size()); + ASSERT(false); + return false; + } + // Ver2 dicts don't have forward links. 
+ *outNextPtNodeArrayPos = NOT_A_DICT_POS; + return true; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h b/app/src/main/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h new file mode 100644 index 000000000..32fa96d15 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_VER2_PT_NODE_ARRAY_READER_H +#define LATINIME_VER2_PT_NODE_ARRAY_READER_H + +#include + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class Ver2PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver2PtNodeArrayReader(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}; + + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader); + + const ReadOnlyByteArrayView mBuffer; +}; +} // namespace latinime +#endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/app/src/main/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp new file mode 100644 index 000000000..165947f87 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" + +namespace latinime { + +// Used to provide stable probabilities even if the user's input count is small. +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1}; + +// Encoded backoff weights. +// Note that we give positive values for trigrams and quadgrams that means the weight is more than +// 1. +// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight. +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8}; + +// This value is used to remove too old entries from the dictionary. +const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS = + 300 * 24 * 60 * 60; // 300 days + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/app/src/main/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h new file mode 100644 index 000000000..71824c954 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H +#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H + +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "utils/ngram_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class DynamicLanguageModelProbabilityUtils { + public: + static float computeRawProbabilityFromCounts(const int count, const int contextCount, + const NgramType ngramType) { + const int minCount = ASSUMED_MIN_COUNTS[static_cast(ngramType)]; + return static_cast(count) / static_cast(std::max(contextCount, minCount)); + } + + static float backoff(const int ngramProbability, const NgramType ngramType) { + const int probability = + ngramProbability + ENCODED_BACKOFF_WEIGHTS[static_cast(ngramType)]; + return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY); + } + + static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) { + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + if (elapsedTime < 0) { + AKLOGE("The elapsed time is negatime value. Timestamp overflow?"); + return NOT_A_PROBABILITY; + } + // TODO: Improve this logic. + // We don't modify probability depending on the elapsed time. + return probability; + } + + static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS; + } + + static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + // More recently input entries get higher priority. 
+ return historicalInfo.getTimestamp(); + } + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils); + + static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram."); + + static const int ASSUMED_MIN_COUNTS[]; + static const int ENCODED_BACKOFF_WEIGHTS[]; + static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS; +}; + +} // namespace latinime +#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp new file mode 100644 index 000000000..c10e4906b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -0,0 +1,478 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/language_model_dict_content.h" + +#include +#include + +#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; +const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; + +bool LanguageModelDictContent::save(FILE *const file) const { + return mTrieMap.save(file) && mGlobalCounters.save(file); +} + +bool LanguageModelDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const LanguageModelDictContent *const originalContent) { + return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), + 0 /* nextLevelBitmapEntryIndex */); +} + +const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, + const int wordId, const bool mustMatchAllPrevWords, + const HeaderPolicy *const headerPolicy) const { + int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; + bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); + int maxPrevWordCount = 0; + for (size_t i = 0; i < prevWordIds.size(); ++i) { + const int nextBitmapEntryIndex = + mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex; + if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) { + break; + } + maxPrevWordCount = i + 1; + bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; + } + + const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); + if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { + // The word should be treated as a invalid word. 
+ return WordAttributes(); + } + for (int i = maxPrevWordCount; i >= 0; --i) { + if (mustMatchAllPrevWords && prevWordIds.size() > static_cast(i)) { + break; + } + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); + if (!result.mIsValid) { + continue; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); + int probability = NOT_A_PROBABILITY; + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + int contextCount = 0; + if (i == 0) { + // unigram + contextCount = mGlobalCounters.getTotalCount(); + } else { + const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( + prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); + if (!prevWordProbabilityEntry.isValid()) { + continue; + } + if (prevWordProbabilityEntry.representsBeginningOfSentence() + && historicalInfo->getCount() == 1) { + // BoS ngram requires multiple contextCount. + continue; + } + contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); + } + const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1); + const float rawProbability = + DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( + historicalInfo->getCount(), contextCount, ngramType); + const int encodedRawProbability = + ProbabilityUtils::encodeRawProbability(rawProbability); + const int decayedProbability = + DynamicLanguageModelProbabilityUtils::getDecayedProbability( + encodedRawProbability, *historicalInfo); + probability = DynamicLanguageModelProbabilityUtils::backoff( + decayedProbability, ngramType); + } else { + probability = probabilityEntry.getProbability(); + } + // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in + // probabilityEntry. 
+ return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), + unigramProbabilityEntry.isNotAWord(), + unigramProbabilityEntry.isPossiblyOffensive()); + } + // Cannot find the word. + return WordAttributes(); +} + +ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( + const WordIdArrayView prevWordIds, const int wordId) const { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + return ProbabilityEntry(); + } + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); + if (!result.mIsValid) { + // Not found. + return ProbabilityEntry(); + } + return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); +} + +bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId, const ProbabilityEntry *const probabilityEntry) { + if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { + return false; + } + const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + return false; + } + return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); +} + +bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + // Cannot find bitmap entry for the probability entry. The entry doesn't exist. 
+ return false; + } + return mTrieMap.remove(wordId, bitmapEntryIndex); +} + +LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries( + const WordIdArrayView prevWordIds) const { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo); +} + +std::vector + LanguageModelDictContent::exportAllNgramEntriesRelatedToWord( + const HeaderPolicy *const headerPolicy, const int wordId) const { + const TrieMap::Result result = mTrieMap.getRoot(wordId); + if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) { + // The word doesn't have any related ngram entries. + return std::vector(); + } + std::vector prevWordIds = { wordId }; + std::vector entries; + exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex, + &prevWordIds, &entries); + return entries; +} + +void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( + const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex, + std::vector *const prevWordIds, + std::vector *const outBummpedFullEntryInfo) const { + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + const int wordId = entry.key(); + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + if (probabilityEntry.isValid()) { + const WordAttributes wordAttributes = getWordAttributes( + WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, + headerPolicy); + outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, + wordAttributes, probabilityEntry); + } + if (entry.hasNextLevelMap()) { + prevWordIds->push_back(wordId); + exportAllNgramEntriesRelatedToWordInner(headerPolicy, + entry.getNextLevelBitmapEntryIndex(), prevWordIds, outBummpedFullEntryInfo); + prevWordIds->pop_back(); + } + } +} + +bool LanguageModelDictContent::truncateEntries(const 
EntryCounts ¤tEntryCounts, + const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const outEntryCounters) { + for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) { + const int totalWordCount = prevWordCount + 1; + const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount); + if (currentEntryCounts.getNgramCount(ngramType) + <= maxEntryCounts.getNgramCount(ngramType)) { + outEntryCounters->setNgramCount(ngramType, + currentEntryCounts.getNgramCount(ngramType)); + continue; + } + int entryCount = 0; + if (!turncateEntriesInSpecifiedLevel(headerPolicy, + maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) { + return false; + } + outEntryCounters->setNgramCount(ngramType, entryCount); + } + return true; +} + +bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, + const int wordId, const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { + if (!mHasHistoricalInfo) { + AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); + return false; + } + const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId); + const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom( + originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy); + if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) { + return false; + } + mGlobalCounters.incrementTotalCount(); + mGlobalCounters.updateMaxValueOfCounters( + updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); + for (size_t i = 0; i < prevWordIds.size(); ++i) { + if (prevWordIds[i] == NOT_A_WORD_ID) { + break; + } + // TODO: Optimize this code. 
+ const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1); + const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry( + limitedPrevWordIds, wordId); + const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom( + originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy); + if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { + return false; + } + mGlobalCounters.updateMaxValueOfCounters( + updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); + if (!originalNgramProbabilityEntry.isValid()) { + // (i + 2) words are used in total because the prevWords consists of (i + 1) words when + // looking at its i-th element. + entryCountersToUpdate->incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(i + 2)); + } + } + return true; +} + +const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( + const ProbabilityEntry &originalProbabilityEntry, const bool isValid, + const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { + const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), + 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() + + historicalInfo.getCount()); + if (originalProbabilityEntry.isValid()) { + return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); + } else { + return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo); + } +} + +bool LanguageModelDictContent::runGCInner( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) { + for (auto &entry : trieMapRange) { + const auto it = terminalIdMap->find(entry.key()); + if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { + // The word has been removed. 
+ continue; + } + if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { + return false; + } + if (entry.hasNextLevelMap()) { + if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), + mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) { + return false; + } + } + } + return true; +} + +int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) { + int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); + for (const int wordId : prevWordIds) { + const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex); + if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) { + lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + continue; + } + if (!result.mIsValid) { + if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo), + lastBitmapEntryIndex)) { + AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId, + lastBitmapEntryIndex); + return TrieMap::INVALID_INDEX; + } + } + lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId, + lastBitmapEntryIndex); + } + return lastBitmapEntryIndex; +} + +int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { + int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); + for (const int wordId : prevWordIds) { + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); + if (!result.mIsValid) { + return TrieMap::INVALID_INDEX; + } + bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + } + return bitmapEntryIndex; +} + +bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, + const int prevWordCount, const HeaderPolicy *const headerPolicy, + const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + if (prevWordCount > 
MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", + prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); + return false; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + if (prevWordCount > 0 && probabilityEntry.isValid() + && !mTrieMap.getRoot(entry.key()).mIsValid) { + // The entry is related to a word that has been removed. Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + if (mHasHistoricalInfo && probabilityEntry.isValid()) { + const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); + if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( + *originalHistoricalInfo)) { + // Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + if (needsToHalveCounters) { + const int updatedCount = originalHistoricalInfo->getCount() / 2; + if (updatedCount == 0) { + // Remove the entry. 
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), + originalHistoricalInfo->getLevel(), updatedCount); + const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), + &historicalInfoToSave); + if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), + bitmapEntryIndex)) { + return false; + } + } + } + outEntryCounters->incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1)); + if (!entry.hasNextLevelMap()) { + continue; + } + if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), + prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { + return false; + } + } + return true; +} + +bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel( + const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, + int *const outEntryCount) { + std::vector prevWordIds; + std::vector entryInfoVector; + if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(), + &prevWordIds, &entryInfoVector)) { + return false; + } + if (static_cast(entryInfoVector.size()) <= maxEntryCount) { + *outEntryCount = static_cast(entryInfoVector.size()); + return true; + } + *outEntryCount = maxEntryCount; + const int entryCountToRemove = static_cast(entryInfoVector.size()) - maxEntryCount; + std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove, + entryInfoVector.end(), + EntryInfoToTurncate::Comparator()); + for (int i = 0; i < entryCountToRemove; ++i) { + const EntryInfoToTurncate &entryInfo = entryInfoVector[i]; + if (!removeNgramProbabilityEntry( + WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), + entryInfo.mKey)) { + return false; + } + } + return true; +} + +bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy, + const int targetLevel, const int 
bitmapEntryIndex, std::vector *const prevWordIds, + std::vector *const outEntryInfo) const { + const int prevWordCount = prevWordIds->size(); + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + if (prevWordCount < targetLevel) { + if (!entry.hasNextLevelMap()) { + continue; + } + prevWordIds->push_back(entry.key()); + if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(), + prevWordIds, outEntryInfo)) { + return false; + } + prevWordIds->pop_back(); + continue; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + const int priority = mHasHistoricalInfo + ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( + *probabilityEntry.getHistoricalInfo()) + : probabilityEntry.getProbability(); + outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), + entry.key(), targetLevel, prevWordIds->data()); + } + return true; +} + +bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( + const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { + if (left.mPriority != right.mPriority) { + return left.mPriority < right.mPriority; + } + if (left.mCount != right.mCount) { + return left.mCount < right.mCount; + } + if (left.mKey != right.mKey) { + return left.mKey < right.mKey; + } + if (left.mPrevWordCount != right.mPrevWordCount) { + return left.mPrevWordCount > right.mPrevWordCount; + } + for (int i = 0; i < left.mPrevWordCount; ++i) { + if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) { + return left.mPrevWordIds[i] < right.mPrevWordIds[i]; + } + } + // left and rigth represent the same entry. 
+ return false; +} + +LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, + const int count, const int key, const int prevWordCount, const int *const prevWordIds) + : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { + memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content.h b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content.h new file mode 100644 index 000000000..db8c6e12b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content.h @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H +#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H + +#include +#include + +#include "defines.h" +#include "dictionary/property/word_attributes.h" +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/trie_map.h" +#include "utils/byte_array_view.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class HeaderPolicy; + +/** + * Class representing language model. + * + * This class provides methods to get and store unigram/n-gram probability information and flags. + */ +class LanguageModelDictContent { + public: + // Pair of word id and probability entry used for iteration. + class WordIdAndProbabilityEntry { + public: + WordIdAndProbabilityEntry(const int wordId, const ProbabilityEntry &probabilityEntry) + : mWordId(wordId), mProbabilityEntry(probabilityEntry) {} + + int getWordId() const { return mWordId; } + const ProbabilityEntry getProbabilityEntry() const { return mProbabilityEntry; } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(WordIdAndProbabilityEntry); + DISALLOW_ASSIGNMENT_OPERATOR(WordIdAndProbabilityEntry); + + const int mWordId; + const ProbabilityEntry mProbabilityEntry; + }; + + // Iterator. 
+ class EntryIterator { + public: + EntryIterator(const TrieMap::TrieMapIterator &trieMapIterator, + const bool hasHistoricalInfo) + : mTrieMapIterator(trieMapIterator), mHasHistoricalInfo(hasHistoricalInfo) {} + + const WordIdAndProbabilityEntry operator*() const { + const TrieMap::TrieMapIterator::IterationResult &result = *mTrieMapIterator; + return WordIdAndProbabilityEntry( + result.key(), ProbabilityEntry::decode(result.value(), mHasHistoricalInfo)); + } + + bool operator!=(const EntryIterator &other) const { + return mTrieMapIterator != other.mTrieMapIterator; + } + + const EntryIterator &operator++() { + ++mTrieMapIterator; + return *this; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryIterator); + DISALLOW_ASSIGNMENT_OPERATOR(EntryIterator); + + TrieMap::TrieMapIterator mTrieMapIterator; + const bool mHasHistoricalInfo; + }; + + // Class represents range to use range base for loops. + class EntryRange { + public: + EntryRange(const TrieMap::TrieMapRange trieMapRange, const bool hasHistoricalInfo) + : mTrieMapRange(trieMapRange), mHasHistoricalInfo(hasHistoricalInfo) {} + + EntryIterator begin() const { + return EntryIterator(mTrieMapRange.begin(), mHasHistoricalInfo); + } + + EntryIterator end() const { + return EntryIterator(mTrieMapRange.end(), mHasHistoricalInfo); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryRange); + DISALLOW_ASSIGNMENT_OPERATOR(EntryRange); + + const TrieMap::TrieMapRange mTrieMapRange; + const bool mHasHistoricalInfo; + }; + + class DumppedFullEntryInfo { + public: + DumppedFullEntryInfo(std::vector &prevWordIds, const int targetWordId, + const WordAttributes &wordAttributes, const ProbabilityEntry &probabilityEntry) + : mPrevWordIds(prevWordIds), mTargetWordId(targetWordId), + mWordAttributes(wordAttributes), mProbabilityEntry(probabilityEntry) {} + + const WordIdArrayView getPrevWordIds() const { return WordIdArrayView(mPrevWordIds); } + int getTargetWordId() const { return mTargetWordId; } + const WordAttributes 
&getWordAttributes() const { return mWordAttributes; } + const ProbabilityEntry &getProbabilityEntry() const { return mProbabilityEntry; } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(DumppedFullEntryInfo); + + const std::vector mPrevWordIds; + const int mTargetWordId; + const WordAttributes mWordAttributes; + const ProbabilityEntry mProbabilityEntry; + }; + + LanguageModelDictContent(const ReadWriteByteArrayView *const buffers, + const bool hasHistoricalInfo) + : mTrieMap(buffers[TRIE_MAP_BUFFER_INDEX]), + mGlobalCounters(buffers[GLOBAL_COUNTERS_BUFFER_INDEX]), + mHasHistoricalInfo(hasHistoricalInfo) {} + + explicit LanguageModelDictContent(const bool hasHistoricalInfo) + : mTrieMap(), mGlobalCounters(), mHasHistoricalInfo(hasHistoricalInfo) {} + + bool isNearSizeLimit() const { + return mTrieMap.isNearSizeLimit() || mGlobalCounters.needsToHalveCounters(); + } + + bool save(FILE *const file) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const LanguageModelDictContent *const originalContent); + + const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, + const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const; + + ProbabilityEntry getProbabilityEntry(const int wordId) const { + return getNgramProbabilityEntry(WordIdArrayView(), wordId); + } + + bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { + mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount()); + return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); + } + + bool removeProbabilityEntry(const int wordId) { + return removeNgramProbabilityEntry(WordIdArrayView(), wordId); + } + + ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) const; + + bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, + const ProbabilityEntry *const 
probabilityEntry); + + bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId); + + EntryRange getProbabilityEntries(const WordIdArrayView prevWordIds) const; + + std::vector exportAllNgramEntriesRelatedToWord( + const HeaderPolicy *const headerPolicy, const int wordId) const; + + bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const outEntryCounters) { + if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), + 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(), + outEntryCounters)) { + return false; + } + if (mGlobalCounters.needsToHalveCounters()) { + mGlobalCounters.halveCounters(); + } + return true; + } + + // entryCounts should be created by updateAllProbabilityEntries. + bool truncateEntries(const EntryCounts ¤tEntryCounts, const EntryCounts &maxEntryCounts, + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters); + + bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId, + const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const entryCountersToUpdate); + + private: + DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); + + class EntryInfoToTurncate { + public: + class Comparator { + public: + bool operator()(const EntryInfoToTurncate &left, + const EntryInfoToTurncate &right) const; + private: + DISALLOW_ASSIGNMENT_OPERATOR(Comparator); + }; + + EntryInfoToTurncate(const int priority, const int count, const int key, + const int prevWordCount, const int *const prevWordIds); + + int mPriority; + // TODO: Remove. 
+ int mCount; + int mKey; + int mPrevWordCount; + int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); + }; + + static const int TRIE_MAP_BUFFER_INDEX; + static const int GLOBAL_COUNTERS_BUFFER_INDEX; + + TrieMap mTrieMap; + LanguageModelDictContentGlobalCounters mGlobalCounters; + const bool mHasHistoricalInfo; + + bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex); + int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); + int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; + bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, + const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, + MutableEntryCounters *const outEntryCounters); + bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, + const int maxEntryCount, const int targetLevel, int *const outEntryCount); + bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel, + const int bitmapEntryIndex, std::vector *const prevWordIds, + std::vector *const outEntryInfo) const; + const ProbabilityEntry createUpdatedEntryFrom(const ProbabilityEntry &originalProbabilityEntry, + const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy) const; + void exportAllNgramEntriesRelatedToWordInner(const HeaderPolicy *const headerPolicy, + const int bitmapEntryIndex, std::vector *const prevWordIds, + std::vector *const outBummpedFullEntryInfo) const; +}; +} // namespace latinime +#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp new file mode 100644 index 
000000000..89cf0e306 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" + +#include + +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +const int LanguageModelDictContentGlobalCounters::COUNTER_VALUE_NEAR_LIMIT_THRESHOLD = + (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 64; +const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD = 1 << 30; +const int LanguageModelDictContentGlobalCounters::COUNTER_SIZE_IN_BYTES = 4; +const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_INDEX = 0; +const int LanguageModelDictContentGlobalCounters::MAX_VALUE_OF_COUNTERS_INDEX = 1; + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h new file mode 100644 index 000000000..3f87c0ea0 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H +#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H + +#include + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class LanguageModelDictContentGlobalCounters { + public: + explicit LanguageModelDictContentGlobalCounters(const ReadWriteByteArrayView buffer) + : mBuffer(buffer, 0 /* maxAdditionalBufferSize */), + mTotalCount(readValue(mBuffer, TOTAL_COUNT_INDEX)), + mMaxValueOfCounters(readValue(mBuffer, MAX_VALUE_OF_COUNTERS_INDEX)) {} + + LanguageModelDictContentGlobalCounters() + : mBuffer(0 /* maxAdditionalBufferSize */), mTotalCount(0), mMaxValueOfCounters(0) {} + + bool needsToHalveCounters() const { + return mMaxValueOfCounters >= COUNTER_VALUE_NEAR_LIMIT_THRESHOLD + || mTotalCount >= TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; + } + + int getTotalCount() const { + return mTotalCount; + } + + bool save(FILE *const file) const { + BufferWithExtendableBuffer bufferToWrite( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!bufferToWrite.writeUint(mTotalCount, COUNTER_SIZE_IN_BYTES, + TOTAL_COUNT_INDEX * COUNTER_SIZE_IN_BYTES)) { + return false; + } + if (!bufferToWrite.writeUint(mMaxValueOfCounters, COUNTER_SIZE_IN_BYTES, + MAX_VALUE_OF_COUNTERS_INDEX * COUNTER_SIZE_IN_BYTES)) { + return 
false; + } + return DictFileWritingUtils::writeBufferToFileTail(file, &bufferToWrite); + } + + void incrementTotalCount() { + mTotalCount += 1; + } + + void addToTotalCount(const int count) { + mTotalCount += count; + } + + void updateMaxValueOfCounters(const int count) { + mMaxValueOfCounters = std::max(count, mMaxValueOfCounters); + } + + void halveCounters() { + mMaxValueOfCounters /= 2; + mTotalCount /= 2; + } + +private: + DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContentGlobalCounters); + + const static int COUNTER_VALUE_NEAR_LIMIT_THRESHOLD; + const static int TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; + const static int COUNTER_SIZE_IN_BYTES; + const static int TOTAL_COUNT_INDEX; + const static int MAX_VALUE_OF_COUNTERS_INDEX; + + BufferWithExtendableBuffer mBuffer; + int mTotalCount; + int mMaxValueOfCounters; + + static int readValue(const BufferWithExtendableBuffer &buffer, const int index) { + const int pos = COUNTER_SIZE_IN_BYTES * index; + if (pos + COUNTER_SIZE_IN_BYTES > buffer.getTailPosition()) { + return 0; + } + return buffer.readUint(COUNTER_SIZE_IN_BYTES, pos); + } +}; +} // namespace latinime +#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/probability_entry.h b/app/src/main/jni/src/dictionary/structure/v4/content/probability_entry.h new file mode 100644 index 000000000..473354b90 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/probability_entry.h @@ -0,0 +1,176 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_ENTRY_H +#define LATINIME_PROBABILITY_ENTRY_H + +#include +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class ProbabilityEntry { + public: + ProbabilityEntry(const ProbabilityEntry &probabilityEntry) + : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), + mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} + + // Dummy entry + ProbabilityEntry() + : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY), + mHistoricalInfo() {} + + // Entry without historical information + ProbabilityEntry(const int flags, const int probability) + : mFlags(flags), mProbability(probability), mHistoricalInfo() {} + + // Entry with historical information. + ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo) + : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {} + + // Create from unigram property. + ProbabilityEntry(const UnigramProperty *const unigramProperty) + : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(), + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isPossiblyOffensive())), + mProbability(unigramProperty->getProbability()), + mHistoricalInfo(unigramProperty->getHistoricalInfo()) {} + + // Create from ngram property. 
+ // TODO: Set flags. + ProbabilityEntry(const NgramProperty *const ngramProperty) + : mFlags(0), mProbability(ngramProperty->getProbability()), + mHistoricalInfo(ngramProperty->getHistoricalInfo()) {} + + bool isValid() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0; + } + + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + + uint8_t getFlags() const { + return mFlags; + } + + int getProbability() const { + return mProbability; + } + + const HistoricalInfo *getHistoricalInfo() const { + return &mHistoricalInfo; + } + + bool representsBeginningOfSentence() const { + return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0; + } + + bool isNotAWord() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0; + } + + bool isBlacklisted() const { + return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0; + } + + bool isPossiblyOffensive() const { + return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0; + } + + uint64_t encode(const bool hasHistoricalInfo) const { + uint64_t encodedEntry = static_cast(mFlags); + if (hasHistoricalInfo) { + encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT)) + | static_cast(mHistoricalInfo.getTimestamp()); + encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT)) + | static_cast(mHistoricalInfo.getLevel()); + encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) + | static_cast(mHistoricalInfo.getCount()); + } else { + encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT)) + | static_cast(mProbability); + } + return encodedEntry; + } + + static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) { + if (hasHistoricalInfo) { + const int flags = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, + Ver4DictConstants::TIME_STAMP_FIELD_SIZE + + 
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); + const int timestamp = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::TIME_STAMP_FIELD_SIZE, + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); + const int level = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); + const int count = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */); + const HistoricalInfo historicalInfo(timestamp, level, count); + return ProbabilityEntry(flags, &historicalInfo); + } else { + const int flags = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, + Ver4DictConstants::PROBABILITY_SIZE); + const int probability = readFromEncodedEntry(encodedEntry, + Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */); + return ProbabilityEntry(flags, probability); + } + } + + private: + // Copy constructor is public to use this class as a type of return value. 
+ DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); + + const uint8_t mFlags; + const int mProbability; + const HistoricalInfo mHistoricalInfo; + + static int readFromEncodedEntry(const uint64_t encodedEntry, const int size, const int pos) { + return static_cast( + (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1)); + } + + static uint8_t createFlags(const bool representsBeginningOfSentence, + const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) { + uint8_t flags = 0; + if (representsBeginningOfSentence) { + flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + } + if (isNotAWord) { + flags |= Ver4DictConstants::FLAG_NOT_A_WORD; + } + if (isBlacklisted) { + flags |= Ver4DictConstants::FLAG_BLACKLISTED; + } + if (isPossiblyOffensive) { + flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE; + } + return flags; + } +}; +} // namespace latinime +#endif /* LATINIME_PROBABILITY_ENTRY_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp new file mode 100644 index 000000000..e3b419449 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/shortcut_dict_content.h" + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const { + const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); + if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { + AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", + *shortcutEntryPos, shortcutListBuffer->getTailPosition()); + ASSERT(false); + if (outhasNext) { + *outhasNext = false; + } + if (outCodePointCount) { + *outCodePointCount = 0; + } + return; + } + + const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + if (outProbability) { + *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; + } + if (outhasNext) { + *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + } + if (outCodePoint && outCodePointCount) { + shortcutListBuffer->readCodePointsAndAdvancePosition( + maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); + } +} + +int ShortcutDictContent::getShortcutListHeadPos(const int terminalId) const { + const SparseTable *const addressLookupTable = getAddressLookupTable(); + if (!addressLookupTable->contains(terminalId)) { + return NOT_A_DICT_POS; + } + return addressLookupTable->get(terminalId); +} + +bool ShortcutDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent) { + for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); + it != terminalIdMap->end(); ++it) { + const int 
originalShortcutListPos = + originalShortcutDictContent->getShortcutListHeadPos(it->first); + if (originalShortcutListPos == NOT_A_DICT_POS) { + continue; + } + const int shortcutListPos = getContentBuffer()->getTailPosition(); + // Copy shortcut list from original content. + if (!copyShortcutListFromDictContent(originalShortcutListPos, originalShortcutDictContent, + shortcutListPos)) { + AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d", + originalShortcutListPos, shortcutListPos); + return false; + } + // Set shortcut list position to the lookup table. + if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) { + AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d", + it->second, shortcutListPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::createNewShortcutList(const int terminalId) { + const int shortcutListListPos = getContentBuffer()->getTailPosition(); + return getUpdatableAddressLookupTable()->set(terminalId, shortcutListListPos); +} + +bool ShortcutDictContent::copyShortcutList(const int shortcutListPos, const int toPos) { + return copyShortcutListFromDictContent(shortcutListPos, this, toPos); +} + +bool ShortcutDictContent::copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) { + bool hasNext = true; + int readingPos = shortcutListPos; + int writingPos = toPos; + int codePoints[MAX_WORD_LENGTH]; + while (hasNext) { + int probability = 0; + int codePointCount = 0; + sourceShortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, + codePoints, &codePointCount, &probability, &hasNext, &readingPos); + if (!writeShortcutEntryAndAdvancePosition(codePoints, codePointCount, probability, + hasNext, &writingPos)) { + AKLOGE("Cannot write shortcut entry to copy. 
pos: %d", writingPos); + return false; + } + } + return true; +} + +bool ShortcutDictContent::setProbability(const int probability, const int shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = shortcutListBuffer->readUint( + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); + const bool hasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; + const int shortcutFlagsToWrite = createAndGetShortcutFlags(probability, hasNext); + return shortcutListBuffer->writeUint(shortcutFlagsToWrite, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); +} + +bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos) { + BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); + const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); + if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, + Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); + return false; + } + if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, + true /* writesTerminator */, shortcutEntryPos)) { + AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); + return false; + } + return true; +} + +// Find a shortcut entry that has specified target and return its position. 
+int ShortcutDictContent::findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const { + bool hasNext = true; + int readingPos = shortcutListPos; + int targetCodePoints[MAX_WORD_LENGTH]; + while (hasNext) { + const int entryPos = readingPos; + int probability = 0; + int targetCodePointCount = 0; + getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, targetCodePoints, &targetCodePointCount, + &probability, &hasNext, &readingPos); + if (targetCodePointCount != codePointCount) { + continue; + } + bool matched = true; + for (int i = 0; i < codePointCount; ++i) { + if (targetCodePointsToFind[i] != targetCodePoints[i]) { + matched = false; + break; + } + } + if (matched) { + return entryPos; + } + } + return NOT_A_DICT_POS; +} + +int ShortcutDictContent::createAndGetShortcutFlags(const int probability, + const bool hasNext) const { + return (probability & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK) + | (hasNext ? Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK : 0); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h b/app/src/main/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h new file mode 100644 index 000000000..27de4e79e --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_DICT_CONTENT_H +#define LATINIME_SHORTCUT_DICT_CONTENT_H + +#include + +#include "defines.h" +#include "dictionary/structure/v4/content/sparse_table_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +class ReadWriteByteArrayView; + +class ShortcutDictContent : public SparseTableDictContent { + public: + ShortcutDictContent(const ReadWriteByteArrayView *const buffers) + : SparseTableDictContent(buffers, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + ShortcutDictContent() + : SparseTableDictContent(Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} + + void getShortcutEntry(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, int *const outProbability, bool *const outhasNext, + const int shortcutEntryPos) { + int readingPos = shortcutEntryPos; + return getShortcutEntryAndAdvancePosition(maxCodePointCount, outCodePoint, + outCodePointCount, outProbability, outhasNext, &readingPos); + } + + void getShortcutEntryAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoint, int *const outCodePointCount, int *const outProbability, + bool *const outhasNext, int *const shortcutEntryPos) const; + + // Returns head position of shortcut list for a PtNode specified by terminalId. 
+ int getShortcutListHeadPos(const int terminalId) const; + + bool flushToFile(FILE *const file) const { + return flush(file); + } + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const ShortcutDictContent *const originalShortcutDictContent); + + bool createNewShortcutList(const int terminalId); + + bool copyShortcutList(const int shortcutListPos, const int toPos); + + bool setProbability(const int probability, const int shortcutEntryPos); + + bool writeShortcutEntry(const int *const codePoint, const int codePointCount, + const int probability, const bool hasNext, const int shortcutEntryPos) { + int writingPos = shortcutEntryPos; + return writeShortcutEntryAndAdvancePosition(codePoint, codePointCount, probability, + hasNext, &writingPos); + } + + bool writeShortcutEntryAndAdvancePosition(const int *const codePoint, + const int codePointCount, const int probability, const bool hasNext, + int *const shortcutEntryPos); + + int findShortcutEntryAndGetPos(const int shortcutListPos, + const int *const targetCodePointsToFind, const int codePointCount) const; + + private: + DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent); + + bool copyShortcutListFromDictContent(const int shortcutListPos, + const ShortcutDictContent *const sourceShortcutDictContent, const int toPos); + + int createAndGetShortcutFlags(const int probability, const bool hasNext) const; +}; +} // namespace latinime +#endif /* LATINIME_SHORTCUT_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/single_dict_content.h b/app/src/main/jni/src/dictionary/structure/v4/content/single_dict_content.h new file mode 100644 index 000000000..6faa9a28b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/single_dict_content.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the 
License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SINGLE_DICT_CONTENT_H +#define LATINIME_SINGLE_DICT_CONTENT_H + +#include + +#include "defines.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class SingleDictContent { + public: + SingleDictContent(const ReadWriteByteArrayView buffer) + : mExpandableContentBuffer(buffer, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} + + SingleDictContent() + : mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE) {} + + virtual ~SingleDictContent() {} + + bool isNearSizeLimit() const { + return mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + BufferWithExtendableBuffer *getWritableBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(FILE *const file) const { + return DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer); + } + + private: + DISALLOW_COPY_AND_ASSIGN(SingleDictContent); + + BufferWithExtendableBuffer mExpandableContentBuffer; +}; +} // namespace latinime +#endif /* LATINIME_SINGLE_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/app/src/main/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp new file mode 100644 index 000000000..685365f36 --- /dev/null +++ 
b/app/src/main/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/sparse_table_dict_content.h" + +#include "dictionary/utils/dict_file_writing_utils.h" + +namespace latinime { + +const int SparseTableDictContent::LOOKUP_TABLE_BUFFER_INDEX = 0; +const int SparseTableDictContent::ADDRESS_TABLE_BUFFER_INDEX = 1; +const int SparseTableDictContent::CONTENT_BUFFER_INDEX = 2; + +bool SparseTableDictContent::flush(FILE *const file) const { + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableLookupTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableAddressTableBuffer)) { + return false; + } + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableContentBuffer)) { + return false; + } + return true; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h b/app/src/main/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h new file mode 100644 index 000000000..6245abc8e --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + 
* you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H +#define LATINIME_SPARSE_TABLE_DICT_CONTENT_H + +#include + +#include "defines.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/sparse_table.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +// TODO: Support multiple contents. +class SparseTableDictContent { + public: + AK_FORCE_INLINE SparseTableDictContent(const ReadWriteByteArrayView *const buffers, + const int sparseTableBlockSize, const int sparseTableDataSize) + : mExpandableLookupTableBuffer(buffers[LOOKUP_TABLE_BUFFER_INDEX], + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableAddressTableBuffer(buffers[ADDRESS_TABLE_BUFFER_INDEX], + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableContentBuffer(buffers[CONTENT_BUFFER_INDEX], + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize) {} + + SparseTableDictContent(const int sparseTableBlockSize, const int sparseTableDataSize) + : mExpandableLookupTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableAddressTableBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mAddressLookupTable(&mExpandableLookupTableBuffer, 
&mExpandableAddressTableBuffer, + sparseTableBlockSize, sparseTableDataSize) {} + + virtual ~SparseTableDictContent() {} + + bool isNearSizeLimit() const { + return mExpandableLookupTableBuffer.isNearSizeLimit() + || mExpandableAddressTableBuffer.isNearSizeLimit() + || mExpandableContentBuffer.isNearSizeLimit(); + } + + protected: + SparseTable *getUpdatableAddressLookupTable() { + return &mAddressLookupTable; + } + + const SparseTable *getAddressLookupTable() const { + return &mAddressLookupTable; + } + + BufferWithExtendableBuffer *getWritableContentBuffer() { + return &mExpandableContentBuffer; + } + + const BufferWithExtendableBuffer *getContentBuffer() const { + return &mExpandableContentBuffer; + } + + bool flush(FILE *const file) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTableDictContent); + + static const int LOOKUP_TABLE_BUFFER_INDEX; + static const int ADDRESS_TABLE_BUFFER_INDEX; + static const int CONTENT_BUFFER_INDEX; + + BufferWithExtendableBuffer mExpandableLookupTableBuffer; + BufferWithExtendableBuffer mExpandableAddressTableBuffer; + BufferWithExtendableBuffer mExpandableContentBuffer; + SparseTable mAddressLookupTable; +}; +} // namespace latinime +#endif /* LATINIME_SPARSE_TABLE_DICT_CONTENT_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/app/src/main/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp new file mode 100644 index 000000000..5503151fd --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) const { + if (terminalId < 0 || terminalId >= mSize) { + return NOT_A_DICT_POS; + } + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); + return (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) ? + NOT_A_DICT_POS : terminalPos; +} + +bool TerminalPositionLookupTable::setTerminalPtNodePosition( + const int terminalId, const int terminalPtNodePos) { + if (terminalId < 0) { + return false; + } + while (terminalId >= mSize) { + // Write new entry. + if (!getWritableBuffer()->writeUint(Ver4DictConstants::NOT_A_TERMINAL_ADDRESS, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(mSize))) { + return false; + } + mSize++; + } + const int terminalPos = (terminalPtNodePos != NOT_A_DICT_POS) ? + terminalPtNodePos : Ver4DictConstants::NOT_A_TERMINAL_ADDRESS; + return getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId)); +} + +bool TerminalPositionLookupTable::flushToFile(FILE *const file) const { + // If the used buffer size is smaller than the actual buffer size, regenerate the lookup + // table and write the new table to the file. 
+ if (getEntryPos(mSize) < getBuffer()->getTailPosition()) { + TerminalPositionLookupTable lookupTableToWrite; + for (int i = 0; i < mSize; ++i) { + const int terminalPtNodePosition = getTerminalPtNodePosition(i); + if (!lookupTableToWrite.setTerminalPtNodePosition(i, terminalPtNodePosition)) { + AKLOGE("Cannot set terminal position to lookupTableToWrite." + " terminalId: %d, position: %d", i, terminalPtNodePosition); + return false; + } + } + return lookupTableToWrite.flush(file); + } else { + // We can simply use this lookup table because the buffer size has not been + // changed. + return flush(file); + } +} + +bool TerminalPositionLookupTable::runGCTerminalIds(TerminalIdMap *const terminalIdMap) { + int removedEntryCount = 0; + int nextNewTerminalId = 0; + for (int i = 0; i < mSize; ++i) { + const int terminalPos = getBuffer()->readUint( + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(i)); + if (terminalPos == Ver4DictConstants::NOT_A_TERMINAL_ADDRESS) { + // This entry is a garbage. + removedEntryCount++; + } else { + // Give a new terminal id to the entry. + if (!getWritableBuffer()->writeUint(terminalPos, + Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, + getEntryPos(nextNewTerminalId))) { + return false; + } + // Memorize the mapping to the old terminal id to the new terminal id. 
+ terminalIdMap->insert(TerminalIdMap::value_type(i, nextNewTerminalId)); + nextNewTerminalId++; + } + } + mSize = nextNewTerminalId; + return true; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h b/app/src/main/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h new file mode 100644 index 000000000..f45ceb52d --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H +#define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H + +#include +#include + +#include "defines.h" +#include "dictionary/structure/v4/content/single_dict_content.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class TerminalPositionLookupTable : public SingleDictContent { + public: + typedef std::unordered_map TerminalIdMap; + + TerminalPositionLookupTable(const ReadWriteByteArrayView buffer) + : SingleDictContent(buffer), + mSize(getBuffer()->getTailPosition() + / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} + + TerminalPositionLookupTable() : mSize(0) {} + + int getTerminalPtNodePosition(const int terminalId) const; + + bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos); + + int getNextTerminalId() const { + return mSize; + } + + bool flushToFile(FILE *const file) const; + + bool runGCTerminalIds(TerminalIdMap *const terminalIdMap); + + private: + DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); + + int getEntryPos(const int terminalId) const { + return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + } + + int mSize; +}; +} // namespace latinime +#endif // LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H diff --git a/app/src/main/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h b/app/src/main/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h new file mode 100644 index 000000000..25ab22543 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_SHORTCUT_LIST_POLICY_H +#define LATINIME_VER4_SHORTCUT_LIST_POLICY_H + +#include "defines.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "dictionary/structure/v4/content/shortcut_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" + +namespace latinime { + +class Ver4ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + Ver4ShortcutListPolicy(ShortcutDictContent *const shortcutDictContent, + const TerminalPositionLookupTable *const terminalPositionLookupTable) + : mShortcutDictContent(shortcutDictContent) {} + + ~Ver4ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + // The first shortcut entry is located at the head position of the shortcut list. + return pos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + int probability = 0; + mShortcutDictContent->getShortcutEntryAndAdvancePosition(maxCodePointCount, + outCodePoint, outCodePointCount, &probability, outHasNext, pos); + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(probability); + } + } + + void skipAllShortcuts(int *const pos) const { + // Do nothing because we don't need to skip shortcut lists in ver4 dictionaries. 
+ } + + bool addNewShortcut(const int terminalId, const int *const codePoints, const int codePointCount, + const int probability) { + const int shortcutListPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + if (shortcutListPos == NOT_A_DICT_POS) { + // Create shortcut list. + if (!mShortcutDictContent->createNewShortcutList(terminalId)) { + AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); + return false; + } + const int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + return mShortcutDictContent->writeShortcutEntry(codePoints, codePointCount, probability, + false /* hasNext */, writingPos); + } + const int entryPos = mShortcutDictContent->findShortcutEntryAndGetPos(shortcutListPos, + codePoints, codePointCount); + if (entryPos == NOT_A_DICT_POS) { + // Add new entry to the shortcut list. + // Create new shortcut list. + if (!mShortcutDictContent->createNewShortcutList(terminalId)) { + AKLOGE("Cannot create new shortcut list. terminal id: %d", terminalId); + return false; + } + int writingPos = mShortcutDictContent->getShortcutListHeadPos(terminalId); + if (!mShortcutDictContent->writeShortcutEntryAndAdvancePosition(codePoints, + codePointCount, probability, true /* hasNext */, &writingPos)) { + AKLOGE("Cannot write shortcut entry. terminal id: %d, pos: %d", terminalId, + writingPos); + return false; + } + return mShortcutDictContent->copyShortcutList(shortcutListPos, writingPos); + } + // Overwrite existing entry. + bool hasNext = false; + mShortcutDictContent->getShortcutEntry(MAX_WORD_LENGTH, 0 /* outCodePoint */, + 0 /* outCodePointCount */ , 0 /* probability */, &hasNext, entryPos); + if (!mShortcutDictContent->writeShortcutEntry(codePoints, + codePointCount, probability, hasNext, entryPos)) { + AKLOGE("Cannot overwrite shortcut entry. 
terminal id: %d, pos: %d", terminalId, + entryPos); + return false; + } + return true; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4ShortcutListPolicy); + + ShortcutDictContent *const mShortcutDictContent; +}; +} // namespace latinime +#endif // LATINIME_VER4_SHORTCUT_LIST_POLICY_H diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp new file mode 100644 index 000000000..b0a82839b --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/ver4_dict_buffers.h" + +#include +#include +#include +#include +#include +#include + +#include "dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/file_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers( + const char *const dictPath, MmappedBuffer::MmappedBufferPtr &&headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion) { + if (!headerBuffer) { + ASSERT(false); + AKLOGE("The header buffer must be valid to open ver4 dict buffers."); + return Ver4DictBuffersPtr(nullptr); + } + // TODO: take only dictDirPath, and open both header and trie files in the constructor below + const bool isUpdatable = headerBuffer->isUpdatable(); + MmappedBuffer::MmappedBufferPtr bodyBuffer = MmappedBuffer::openBuffer(dictPath, + Ver4DictConstants::BODY_FILE_EXTENSION, isUpdatable); + if (!bodyBuffer) { + return Ver4DictBuffersPtr(nullptr); + } + std::vector buffers; + const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView(); + int position = 0; + while (position < static_cast(buffer.size())) { + const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition( + buffer.data(), &position); + buffers.push_back(buffer.subView(position, bufferSize)); + position += bufferSize; + if (bufferSize < 0 || position < 0 || position > static_cast(buffer.size())) { + AKLOGE("The dict body file is corrupted."); + return Ver4DictBuffersPtr(nullptr); + } + } + if (buffers.size() != Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE) { + AKLOGE("The dict body file is corrupted."); + return Ver4DictBuffersPtr(nullptr); + } + return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer), + formatVersion, buffers)); +} + +bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, + const 
BufferWithExtendableBuffer *const headerBuffer) const { + // Create temporary directory. + const int tmpDirPathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); + char tmpDirPath[tmpDirPathBufSize]; + FileUtils::getFilePathWithSuffix(dictDirPath, + DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE, tmpDirPathBufSize, + tmpDirPath); + if (FileUtils::existsDir(tmpDirPath)) { + if (!FileUtils::removeDirAndFiles(tmpDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", tmpDirPath); + ASSERT(false); + return false; + } + } + umask(S_IWGRP | S_IWOTH); + if (mkdir(tmpDirPath, S_IRWXU) == -1) { + AKLOGE("Cannot create directory: %s. errno: %d.", tmpDirPath, errno); + return false; + } + // Get dictionary base path. + const int dictNameBufSize = strlen(dictDirPath) + 1 /* terminator */; + char dictName[dictNameBufSize]; + FileUtils::getBasename(dictDirPath, dictNameBufSize, dictName); + const int dictPathBufSize = FileUtils::getFilePathBufSize(tmpDirPath, dictName); + char dictPath[dictPathBufSize]; + FileUtils::getFilePath(tmpDirPath, dictName, dictPathBufSize, dictPath); + + // Write header file. + if (!DictFileWritingUtils::flushBufferToFileWithSuffix(dictPath, + Ver4DictConstants::HEADER_FILE_EXTENSION, headerBuffer)) { + AKLOGE("Dictionary header file %s%s cannot be written.", tmpDirPath, + Ver4DictConstants::HEADER_FILE_EXTENSION); + return false; + } + + // Write body file. + const int bodyFilePathBufSize = FileUtils::getFilePathWithSuffixBufSize(dictPath, + Ver4DictConstants::BODY_FILE_EXTENSION); + char bodyFilePath[bodyFilePathBufSize]; + FileUtils::getFilePathWithSuffix(dictPath, Ver4DictConstants::BODY_FILE_EXTENSION, + bodyFilePathBufSize, bodyFilePath); + + const int fd = open(bodyFilePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd == -1) { + AKLOGE("File %s cannot be opened. 
errno: %d", bodyFilePath, errno); + ASSERT(false); + return false; + } + FILE *const file = fdopen(fd, "wb"); + if (!file) { + AKLOGE("fdopen failed for the file %s. errno: %d", bodyFilePath, errno); + ASSERT(false); + return false; + } + + if (!flushDictBuffers(file)) { + fclose(file); + return false; + } + fclose(file); + // Remove existing dictionary. + if (!FileUtils::removeDirAndFiles(dictDirPath)) { + AKLOGE("Existing directory %s cannot be removed.", dictDirPath); + ASSERT(false); + return false; + } + // Rename temporary directory. + if (rename(tmpDirPath, dictDirPath) != 0) { + AKLOGE("%s cannot be renamed to %s", tmpDirPath, dictDirPath); + ASSERT(false); + return false; + } + return true; +} + +bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const { + // Write trie. + if (!DictFileWritingUtils::writeBufferToFileTail(file, &mExpandableTrieBuffer)) { + AKLOGE("Trie cannot be written."); + return false; + } + // Write terminal position lookup table. + if (!mTerminalPositionLookupTable.flushToFile(file)) { + AKLOGE("Terminal position lookup table cannot be written."); + return false; + } + // Write language model content. + if (!mLanguageModelDictContent.save(file)) { + AKLOGE("Language model dict content cannot be written."); + return false; + } + // Write shortcut dict content. 
+ if (!mShortcutDictContent.flushToFile(file)) { + AKLOGE("Shortcut dict content cannot be written."); + return false; + } + return true; +} + +Ver4DictBuffers::Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer, + MmappedBuffer::MmappedBufferPtr &&bodyBuffer, + const FormatUtils::FORMAT_VERSION formatVersion, + const std::vector &contentBuffers) + : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(std::move(bodyBuffer)), + mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion), + mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(), + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mExpandableTrieBuffer(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX], + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), + mTerminalPositionLookupTable( + contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]), + mLanguageModelDictContent(&contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX], + mHeaderPolicy.hasHistoricalInfoOfWords()), + mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]), + mIsUpdatable(mDictBuffer->isUpdatable()) {} + +Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize) + : mHeaderBuffer(nullptr), mDictBuffer(nullptr), mHeaderPolicy(headerPolicy), + mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), + mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(), + mLanguageModelDictContent(headerPolicy->hasHistoricalInfoOfWords()), + mShortcutDictContent(), mIsUpdatable(true) {} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_buffers.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_buffers.h new file mode 100644 index 000000000..c8270c93c --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_buffers.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2013, The Android Open Source 
Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_DICT_BUFFER_H +#define LATINIME_VER4_DICT_BUFFER_H + +#include +#include + +#include "defines.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/v4/content/language_model_dict_content.h" +#include "dictionary/structure/v4/content/shortcut_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class Ver4DictBuffers { + public: + typedef std::unique_ptr Ver4DictBuffersPtr; + + static Ver4DictBuffersPtr openVer4DictBuffers(const char *const dictDirPath, + MmappedBuffer::MmappedBufferPtr &&headerBuffer, + const FormatUtils::FORMAT_VERSION formatVersion); + + static AK_FORCE_INLINE Ver4DictBuffersPtr createVer4DictBuffers( + const HeaderPolicy *const headerPolicy, const int maxTrieSize) { + return Ver4DictBuffersPtr(new Ver4DictBuffers(headerPolicy, maxTrieSize)); + } + + AK_FORCE_INLINE bool isValid() const { + return mHeaderBuffer && mDictBuffer && mHeaderPolicy.isValid(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mExpandableTrieBuffer.isNearSizeLimit() + || mTerminalPositionLookupTable.isNearSizeLimit() + || mLanguageModelDictContent.isNearSizeLimit() + || mShortcutDictContent.isNearSizeLimit(); + 
} + + AK_FORCE_INLINE const HeaderPolicy *getHeaderPolicy() const { + return &mHeaderPolicy; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableHeaderBuffer() { + return &mExpandableHeaderBuffer; + } + + AK_FORCE_INLINE BufferWithExtendableBuffer *getWritableTrieBuffer() { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE const BufferWithExtendableBuffer *getTrieBuffer() const { + return &mExpandableTrieBuffer; + } + + AK_FORCE_INLINE TerminalPositionLookupTable *getMutableTerminalPositionLookupTable() { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE const TerminalPositionLookupTable *getTerminalPositionLookupTable() const { + return &mTerminalPositionLookupTable; + } + + AK_FORCE_INLINE LanguageModelDictContent *getMutableLanguageModelDictContent() { + return &mLanguageModelDictContent; + } + + AK_FORCE_INLINE const LanguageModelDictContent *getLanguageModelDictContent() const { + return &mLanguageModelDictContent; + } + + AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const { + return &mShortcutDictContent; + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + bool flush(const char *const dictDirPath) const { + return flushHeaderAndDictBuffers(dictDirPath, &mExpandableHeaderBuffer); + } + + bool flushHeaderAndDictBuffers(const char *const dictDirPath, + const BufferWithExtendableBuffer *const headerBuffer) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4DictBuffers); + + Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer, + MmappedBuffer::MmappedBufferPtr &&bodyBuffer, + const FormatUtils::FORMAT_VERSION formatVersion, + const std::vector &contentBuffers); + + Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); + + bool flushDictBuffers(FILE *const file) const; + + const MmappedBuffer::MmappedBufferPtr mHeaderBuffer; + const 
MmappedBuffer::MmappedBufferPtr mDictBuffer; + const HeaderPolicy mHeaderPolicy; + BufferWithExtendableBuffer mExpandableHeaderBuffer; + BufferWithExtendableBuffer mExpandableTrieBuffer; + TerminalPositionLookupTable mTerminalPositionLookupTable; + LanguageModelDictContent mLanguageModelDictContent; + ShortcutDictContent mShortcutDictContent; + const int mIsUpdatable; +}; +} // namespace latinime +#endif /* LATINIME_VER4_DICT_BUFFER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp new file mode 100644 index 000000000..fd6907824 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +const char *const Ver4DictConstants::BODY_FILE_EXTENSION = ".body"; +const char *const Ver4DictConstants::HEADER_FILE_EXTENSION = ".header"; + +// Version 4 dictionary size is implicitly limited to 8MB due to 3-byte offsets. +const int Ver4DictConstants::MAX_DICTIONARY_SIZE = 8 * 1024 * 1024; +// Extended region size, which is not GCed region size in dict file + additional buffer size, is +// limited to 1MB to prevent from inefficient traversing. 
+const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024; + +// NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT for Trie and TerminalAddressLookupTable. +// NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT for language model. +// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for shortcut. +const size_t Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE = + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT * 2 + + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT + + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT; +const int Ver4DictConstants::TRIE_BUFFER_INDEX = 0; +const int Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX = + TRIE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; +const int Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX = + TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; +const int Ver4DictConstants::SHORTCUT_BUFFERS_INDEX = + LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT; + +const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1; +const int Ver4DictConstants::PROBABILITY_SIZE = 1; +const int Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE = 1; +const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; +const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; +const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; +const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; +const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0; +const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2; + +const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1; +const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2; +const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4; +const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8; +const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10; + +const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; +const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; + +const int 
Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1; +const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F; +const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80; + +const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT = 1; +const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT = 3; +const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 2; + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_constants.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_constants.h new file mode 100644 index 000000000..13d7a5714 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_dict_constants.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_DICT_CONSTANTS_H +#define LATINIME_VER4_DICT_CONSTANTS_H + +#include "defines.h" + +#include +#include + +namespace latinime { + +// TODO: Create PtConstants under the pt_common and move some constant values there. +// Note that there are corresponding definitions in FormatSpec.java. 
+class Ver4DictConstants { + public: + static const char *const BODY_FILE_EXTENSION; + static const char *const HEADER_FILE_EXTENSION; + static const int MAX_DICTIONARY_SIZE; + static const int MAX_DICT_EXTENDED_REGION_SIZE; + + static const size_t NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE; + static const int TRIE_BUFFER_INDEX; + static const int TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX; + static const int LANGUAGE_MODEL_BUFFER_INDEX; + static const int BIGRAM_BUFFERS_INDEX; + static const int SHORTCUT_BUFFERS_INDEX; + + static const int NOT_A_TERMINAL_ID; + static const int PROBABILITY_SIZE; + static const int FLAGS_IN_LANGUAGE_MODEL_SIZE; + static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; + static const int NOT_A_TERMINAL_ADDRESS; + static const int TERMINAL_ID_FIELD_SIZE; + static const int TIME_STAMP_FIELD_SIZE; + // TODO: Remove + static const int WORD_LEVEL_FIELD_SIZE; + static const int WORD_COUNT_FIELD_SIZE; + // Flags in probability entry. + static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + static const uint8_t FLAG_NOT_A_VALID_ENTRY; + static const uint8_t FLAG_NOT_A_WORD; + static const uint8_t FLAG_BLACKLISTED; + static const uint8_t FLAG_POSSIBLY_OFFENSIVE; + + static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; + static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; + + static const int SHORTCUT_FLAGS_FIELD_SIZE; + static const int SHORTCUT_PROBABILITY_MASK; + static const int SHORTCUT_HAS_NEXT_MASK; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4DictConstants); + + static const size_t NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; + static const size_t NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT; + static const size_t NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT; +}; +} // namespace latinime +#endif /* LATINIME_VER4_DICT_CONSTANTS_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp new file mode 100644 index 
000000000..b38b03dcb --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/content/language_model_dict_content.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int siblingNodePos) const { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { + // Reading invalid position because of bug or broken dictionary. 
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mBuffer->getTailPosition()); + ASSERT(false); + return PtNodeParams(); + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = ptNodePos; + const int headPos = ptNodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const int parentPosOffset = + DynamicPtReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + dictBuf, &pos); + const int parentPos = + DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); + int codePoints[MAX_WORD_LENGTH]; + // Code point table is not used for ver4 dictionaries. + const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos); + int terminalIdFieldPos = NOT_A_DICT_POS; + int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + terminalIdFieldPos = pos; + if (usesAdditionalBuffer) { + terminalIdFieldPos += mBuffer->getOriginalBufferSize(); + } + terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); + } + int childrenPosFieldPos = pos; + if (usesAdditionalBuffer) { + childrenPosFieldPos += mBuffer->getOriginalBufferSize(); + } + int childrenPos = DynamicPtReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, &pos); + if (usesAdditionalBuffer && childrenPos != NOT_A_DICT_POS) { + childrenPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); + } + // Sibling position is the tail position of original PtNode. + int newSiblingNodePos = (siblingNodePos == NOT_A_DICT_POS) ? 
pos : siblingNodePos; + // Read destination node if the read node is a moved node. + if (DynamicPtReadingUtils::isMoved(flags)) { + // The destination position is stored at the same place as the parent position. + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); + } else { + return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, + terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos, + newSiblingNodePos); + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h new file mode 100644 index 000000000..4e5ae3a89 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class HeaderPolicy; +class LanguageModelDictContent; + +/* + * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved + * node and reads node attributes. 
+ */ +class Ver4PatriciaTrieNodeReader : public PtNodeReader { + public: + explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} + + ~Ver4PatriciaTrieNodeReader() {} + + virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const { + return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, + NOT_A_DICT_POS /* siblingNodePos */); + } + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); + + const BufferWithExtendableBuffer *const mBuffer; + + const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, + const int siblingNodePos) const; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H */ diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp new file mode 100644 index 000000000..d974b50f4 --- /dev/null +++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+
+namespace latinime {
+
+// Passed to createAndGetFlags() in updatePtNodeFlags() as the children position field size.
+const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
+
+// Sets the "deleted" flag on the PtNode at toBeUpdatedPtNodeParams->getHeadPos(). When the PtNode
+// is a terminal, its entry in the terminal position lookup table is reset to NOT_A_DICT_POS.
+// Returns false if any write fails.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
+        const PtNodeParams *const toBeUpdatedPtNodeParams) {
+    int pos = toBeUpdatedPtNodeParams->getHeadPos();
+    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+    if (usesAdditionalBuffer) {
+        // Positions in the additional buffer are relative to the end of the original buffer.
+        pos -= mTrieBuffer->getOriginalBufferSize();
+    }
+    // Read original flags
+    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+                    true /* isDeleted */, false /* willBecomeNonTerminal */);
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+    // Update flags.
+    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+            &writingPos)) {
+        return false;
+    }
+    if (toBeUpdatedPtNodeParams->isTerminal()) {
+        // The PtNode is a terminal. Delete entry from the terminal position lookup table.
+        return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+                toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
+    } else {
+        return true;
+    }
+}
+
+// Sets the "moved" flag on the PtNode, stores movedPos in its parent offset field, and rewrites
+// the parent offset of every child PtNode using bigramLinkedNodePos. Returns false as soon as a
+// write fails.
+// TODO: Quit using bigramLinkedNodePos.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
+        const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const int movedPos, const int bigramLinkedNodePos) {
+    int pos = toBeUpdatedPtNodeParams->getHeadPos();
+    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+    if (usesAdditionalBuffer) {
+        pos -= mTrieBuffer->getOriginalBufferSize();
+    }
+    // Read original flags
+    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
+                    false /* isDeleted */, false /* willBecomeNonTerminal */);
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+    // Update flags.
+    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+            &writingPos)) {
+        return false;
+    }
+    // Update moved position, which is stored in the parent offset field.
+    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+            mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+        return false;
+    }
+    if (toBeUpdatedPtNodeParams->hasChildren()) {
+        // Update children's parent position.
+        mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
+        while (!mReadingHelper.isEnd()) {
+            const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
+            // The parent offset field follows the flag field at the head of the child PtNode.
+            int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
+                    + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+            if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
+                    mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
+                    &parentOffsetFieldPos)) {
+                // Parent offset cannot be written because of a bug or a broken dictionary; thus,
+                // we give up to update dictionary.
+                return false;
+            }
+            mReadingHelper.readNextSiblingNode(childPtNodeParams);
+        }
+    }
+    return true;
+}
+
+// Resets the terminal position lookup table entry of the PtNode to NOT_A_DICT_POS and then sets
+// the "will become non-terminal" flag on it. Returns false if either update fails.
+bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
+        const PtNodeParams *const toBeUpdatedPtNodeParams) {
+    int pos = toBeUpdatedPtNodeParams->getHeadPos();
+    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
+    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
+    if (usesAdditionalBuffer) {
+        pos -= mTrieBuffer->getOriginalBufferSize();
+    }
+    // Read original flags
+    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
+            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
+    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
+            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
+                    false /* isDeleted */, true /* willBecomeNonTerminal */);
+    if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+            toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
+        AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
+                toBeUpdatedPtNodeParams->getTerminalId());
+        return false;
+    }
+    // Update flags.
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
+    return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
+            &writingPos);
+}
+
+// Replaces the probability entry of a terminal PtNode with one built from unigramProperty.
+// Returns false for a non-terminal PtNode or when the entry cannot be stored.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
+        const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const UnigramProperty *const unigramProperty) {
+    // Update probability and historical information.
+    // TODO: Update other information in the unigram property.
+    if (!toBeUpdatedPtNodeParams->isTerminal()) {
+        return false;
+    }
+    const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
+    return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
+            toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty);
+}
+
+// Reports through outNeedsToKeepPtNode whether the terminal PtNode still has a valid probability
+// entry. When it does not, the PtNode is marked as "will become non-terminal". Returns false for
+// a non-terminal PtNode or when that marking fails; outNeedsToKeepPtNode is not written then.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+        const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
+    if (!toBeUpdatedPtNodeParams->isTerminal()) {
+        // The log names this function; the previous text referred to a name that does not exist.
+        AKLOGE("updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC is called for "
+                "non-terminal PtNode.");
+        return false;
+    }
+    const ProbabilityEntry originalProbabilityEntry =
+            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
+                    toBeUpdatedPtNodeParams->getTerminalId());
+    if (originalProbabilityEntry.isValid()) {
+        *outNeedsToKeepPtNode = true;
+        return true;
+    }
+    if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
+        AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
+        return false;
+    }
+    *outNeedsToKeepPtNode = false;
+    return true;
+}
+
+// Writes newChildrenPosition into the children position field of the PtNode.
+bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
+        const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
+    int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
+    return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+            newChildrenPosition, &childrenPosFieldPos);
+}
+
+// Writes newTerminalId into the terminal id field of the PtNode.
+bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const int newTerminalId) {
+    return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
+            toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
+}
+
+// Writes a PtNode at *ptNodeWritingPos without reporting the terminal id that was used.
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
+        const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
+    return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
+            ptNodeWritingPos);
+}
+
+// Writes a PtNode at *ptNodeWritingPos and stores a probability entry built from unigramProperty
+// under the terminal id reported by the write.
+bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
+        const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
+        int *const ptNodeWritingPos) {
+    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
+            ptNodeWritingPos)) {
+        return false;
+    }
+    // Write probability.
+    const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
+    return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
+            terminalId, &probabilityEntryOfUnigramProperty);
+}
+
+// Stores the n-gram probability entry for (prevWordIds, wordId). *outAddedNewEntry is set to true
+// only when no valid entry existed before; it is never reset to false here, so callers
+// initialize it. The parameter name matches the declaration in the header.
+// TODO: Support counting ngram entries.
+bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+        const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
+    LanguageModelDictContent *const languageModelDictContent =
+            mBuffers->getMutableLanguageModelDictContent();
+    // Read the existing entry first so that "newly added" can be detected after the write.
+    const ProbabilityEntry probabilityEntry =
+            languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId);
+    const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty);
+    if (!languageModelDictContent->setNgramProbabilityEntry(
+            prevWordIds, wordId, &probabilityEntryOfNgramProperty)) {
+        AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d",
+                prevWordIds[0], prevWordIds.size(), wordId);
+        return false;
+    }
+    if (!probabilityEntry.isValid() && outAddedNewEntry) {
+        *outAddedNewEntry = true;
+    }
+    return true;
+}
+
+// Removes the n-gram probability entry for (prevWordIds, wordId).
+bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
+        const int wordId) {
+    LanguageModelDictContent *const languageModelDictContent =
+            mBuffers->getMutableLanguageModelDictContent();
+    return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId);
+}
+
+// No-op that always succeeds; outBigramEntryCount is left untouched.
+// TODO: Remove when we stop supporting v402 format.
+bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
+        const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
+    // Do nothing.
+    return true;
+}
+
+// Rewrites the parent offset and children position of the PtNode, translating each position
+// through dictPositionRelocationMap when the map has an entry for it. outBigramEntryCount is not
+// used by this implementation.
+bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
+        const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const DictPositionRelocationMap *const dictPositionRelocationMap,
+        int *const outBigramEntryCount) {
+    int parentPos = toBeUpdatedPtNodeParams->getParentPos();
+    if (parentPos != NOT_A_DICT_POS) {
+        PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
+                dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
+        if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
+            parentPos = it->second;
+        }
+    }
+    int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
+            + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
+    // Write updated parent offset.
+    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+            parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
+        return false;
+    }
+
+    // Updates children position.
+    int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
+    if (childrenPos != NOT_A_DICT_POS) {
+        PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
+                dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
+        if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
+            childrenPos = it->second;
+        }
+    }
+    return updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos);
+}
+
+// Adds a shortcut target for the terminal id of ptNodeParams through the shortcut policy.
+bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
+        const int *const targetCodePoints, const int targetCodePointCount,
+        const int shortcutProbability) {
+    if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
+            targetCodePoints, targetCodePointCount, shortcutProbability)) {
+        AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
+        return false;
+    }
+    return true;
+}
+
+// Writes a whole PtNode starting at *ptNodeWritingPos: placeholder flags, parent offset, code
+// points, terminal id (terminals only), children position, and finally the real flags.
+// *outTerminalId, when non-null, receives the terminal id only if the PtNode is written as a
+// terminal. Returns false as soon as a write fails.
+bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
+        const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+        int *const ptNodeWritingPos) {
+    const int nodePos = *ptNodeWritingPos;
+    // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the
+    // PtNode writing.
+    if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
+            0 /* nodeFlags */, ptNodeWritingPos)) {
+        return false;
+    }
+    // Calculate a parent offset and write the offset.
+    if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
+            ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
+        return false;
+    }
+    // Write code points
+    if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
+            ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
+        return false;
+    }
+    int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (!ptNodeParams->willBecomeNonTerminal()) {
+        if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
+            terminalId = ptNodeParams->getTerminalId();
+        } else if (ptNodeParams->isTerminal()) {
+            // Write terminal information using a new terminal id.
+            // Get a new unused terminal id.
+            terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
+        }
+    }
+    // A boolean: it is passed to the bool isTerminal parameter of updatePtNodeFlags() below.
+    const bool isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (isTerminal) {
+        // Update the lookup table.
+        if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
+                terminalId, nodePos)) {
+            return false;
+        }
+        // Write terminal Id.
+        if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
+                Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
+            return false;
+        }
+        if (outTerminalId) {
+            *outTerminalId = terminalId;
+        }
+    }
+    // Write children position
+    if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
+            ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
+        return false;
+    }
+    return updatePtNodeFlags(nodePos, isTerminal,
+            ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
+}
+
+// Builds the node flags from isTerminal and hasMultipleChars (all other attributes are passed as
+// false) and writes them at ptNodePos.
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal,
+        const bool hasMultipleChars) {
+    // Create node flags and write them.
+    PatriciaTrieReadingUtils::NodeFlags nodeFlags =
+            PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
+                    false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */,
+                    false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
+    if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
+        AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
+        return false;
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
new file mode 100644
index 000000000..55856110b
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H
+#define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+class HeaderPolicy;
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PtNodeArrayReader;
+class Ver4ShortcutListPolicy;
+
+/*
+ * This class is used to help write the nodes of a ver4 patricia trie.
+ */
+class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
+ public:
+    // Stores the given pointers and builds the internal reading helper from ptNodeReader and
+    // ptNodeArrayReader.
+    Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
+            Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader,
+            const PtNodeArrayReader *const ptNodeArrayReader,
+            Ver4ShortcutListPolicy *const shortcutPolicy)
+            : mTrieBuffer(trieBuffer), mBuffers(buffers),
+              mReadingHelper(ptNodeReader, ptNodeArrayReader), mShortcutPolicy(shortcutPolicy) {}
+
+    virtual ~Ver4PatriciaTrieNodeWriter() {}
+
+    // Flag updates on an existing PtNode.
+    virtual bool markPtNodeAsDeleted(const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+    virtual bool markPtNodeAsMoved(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int movedPos, const int bigramLinkedNodePos);
+
+    virtual bool markPtNodeAsWillBecomeNonTerminal(
+            const PtNodeParams *const toBeUpdatedPtNodeParams);
+
+    // Field updates on an existing PtNode.
+    virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const UnigramProperty *const unigramProperty);
+
+    virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
+            const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode);
+
+    virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int newChildrenPosition);
+
+    // Not virtual, unlike the other public member functions declared here.
+    bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int newTerminalId);
+
+    // Writing of new PtNodes. ptNodeWritingPos is both the start position and an output.
+    virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+            int *const ptNodeWritingPos);
+
+    virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
+            const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
+
+    // N-gram entry maintenance.
+    virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
+            const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+
+    virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
+
+    virtual bool updateAllBigramEntriesAndDeleteUselessEntries(
+            const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount);
+
+    virtual bool updateAllPositionFields(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const DictPositionRelocationMap *const dictPositionRelocationMap,
+            int *const outBigramEntryCount);
+
+    virtual bool addShortcutTarget(const PtNodeParams *const ptNodeParams,
+            const int *const targetCodePoints, const int targetCodePointCount,
+            const int shortcutProbability);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);
+
+    // Shared implementation of the two public write functions; outTerminalId may be null.
+    bool writePtNodeAndGetTerminalIdAndAdvancePosition(
+            const PtNodeParams *const ptNodeParams, int *const outTerminalId,
+            int *const ptNodeWritingPos);
+
+    bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars);
+
+    static const int CHILDREN_POSITION_FIELD_SIZE;
+
+    // All pointers below are supplied by the constructor.
+    BufferWithExtendableBuffer *const mTrieBuffer;
+    Ver4DictBuffers *const mBuffers;
+    DynamicPtReadingHelper mReadingHelper;
+    Ver4ShortcutListPolicy *const mShortcutPolicy;
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
new file mode 100644
index 000000000..6f96a5a0b
--- /dev/null
+++ 
b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -0,0 +1,603 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/ver4_patricia_trie_policy.h" + +#include +#include + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/word_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. 
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + const int wordId = isTerminal ? 
ptNodeParams.getTerminalId() : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), + wordId, ptNodeParams.getCodePointArrayView()); + readingHelper.readNextSiblingNode(ptNodeParams); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( + maxCodePointCount, outCodePoints); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_WORD_ID; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.isDeleted()) { + return NOT_A_WORD_ID; + } + return ptNodeParams.getTerminalId(); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const 
multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + false /* mustMatchAllPrevWords */, mHeaderPolicy); +} + +int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { + return NOT_A_PROBABILITY; + } + const WordAttributes wordAttributes = + mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + true /* mustMatchAllPrevWords */, mHeaderPolicy); + if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return wordAttributes.getProbability(); +} + +BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( + const int wordId) const { + const int shortcutPos = getShortcutPositionOfWord(wordId); + return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); +} + +void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const { + if (prevWordIds.empty()) { + return; + } + const auto languageModelDictContent = mBuffers->getLanguageModelDictContent(); + for (size_t i = 1; i <= prevWordIds.size(); ++i) { + for (const auto& entry : languageModelDictContent->getProbabilityEntries( + prevWordIds.limit(i))) { + const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry(); + if (!probabilityEntry.isValid()) { + continue; + } + int probability = NOT_A_PROBABILITY; + if (probabilityEntry.hasHistoricalInfo()) { + // TODO: Quit checking count here. + // If count <= 1, the word can be an invaild word. The actual probability should + // be checked using getWordAttributesInContext() in onVisitEntry(). + probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ? 
+ NOT_A_PROBABILITY : 0; + } else { + probability = probabilityEntry.getProbability(); + } + listener->onVisitEntry(probability, entry.getWordId()); + } + } +} + +int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const { + if (wordId == NOT_A_WORD_ID) { + return NOT_A_DICT_POS; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %zd", + wordCodePoints.size()); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", + shortcut.getTargetCodePoints()->size()); + return false; + } + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = wordCodePoints.size(); + memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, + &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { + mEntryCounters.incrementNgramCount(NgramType::Unigram); + } + if (unigramProperty->getShortcuts().size() > 0) { + // Add shortcut target. 
+ const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + AKLOGE("Cannot find word id to add shortcut target."); + return false; + } + const int wordPos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (!mUpdatingHelper.addShortcutTarget(wordPos, + CodePointArrayView(*shortcut.getTargetCodePoints()), + shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " + "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), + shortcut.getProbability()); + return false; + } + } + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { + AKLOGE("Cannot remove unigram. 
ptNodePos: %d", ptNodePos); + return false; + } + if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { + return false; + } + if (!ptNodeParams.representsNonWordInfo()) { + mEntryCounters.decrementNgramCount(NgramType::Unigram); + } + return true; +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + const NgramContext *const ngramContext = ngramProperty->getNgramContext(); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); + return false; + } + if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. 
" + "length: %zd", ngramProperty->getTargetCodePoints()->size()); + return false; + } + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (prevWordIds.empty()) { + return false; + } + for (size_t i = 0; i < prevWordIds.size(); ++i) { + if (prevWordIds[i] != NOT_A_WORD_ID) { + continue; + } + if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { + return false; + } + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, false /* isPossiblyOffensive */, + MAX_PROBABILITY /* probability */, HistoricalInfo()); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), + false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + bool addedNewEntry = false; + if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { + if (addedNewEntry) { + mEntryCounters.incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d",
+                mDictBuffer->getTailPosition());
+        return false;
+    }
+    if (!ngramContext->isValid()) {
+        AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
+        return false;
+    }
+    if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+        AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
+                wordCodePoints.size());
+        return false;  // Refuse over-long words, as addNgramEntry() does.
+    }
+    WordIdArray prevWordIdArray;
+    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+            false /* tryLowerCaseSearch */);
+    if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) {
+        return false;
+    }
+    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+    if (wordId == NOT_A_WORD_ID) {
+        return false;
+    }
+    if (!mNodeWriter.removeNgramEntry(prevWordIds, wordId)) {
+        return false;
+    }
+    mEntryCounters.decrementNgramCount(
+            NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
+    return true;
+}
+
+bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
+        const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
+        const bool isValidWord, const HistoricalInfo historicalInfo) {
+    if (!mBuffers->isUpdatable()) {
+        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+                "dictionary.");
+        return false;
+    }
+    const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
+            false : isValidWord;
+    int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
+    if (wordId == NOT_A_WORD_ID) {
+        // The word is not in the dictionary yet; add it as a unigram first.
+        const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+                false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
+                NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
+                0 /* count */));
+        if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
+            AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
+            return false;
+        }
+        if (!isValidWord) {
+            return true;
+        }
+        wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
+    }
+
+    WordIdArray prevWordIdArray;
+    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+            false /* tryLowerCaseSearch */);
+    if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
+        if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
+            const UnigramProperty beginningOfSentenceUnigramProperty(
+                    true /* representsBeginningOfSentence */,
+                    true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
+                    HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
+            if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+                    &beginningOfSentenceUnigramProperty)) {
+                AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
+                return false;
+            }
+            // Refresh word ids now that the beginning-of-sentence entry exists.
+            ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+        }
+        // Update entries for beginning of sentence.
+        if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
+                prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isValid */, historicalInfo,
+                mHeaderPolicy, &mEntryCounters)) {
+            return false;
+        }
+    }
+    if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
+            wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
+        return false;
+    }
+    return true;
+}
+
+bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
+    if (!mBuffers->isUpdatable()) {
+        AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
+        return false;
+    }
+    if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
+        AKLOGE("Cannot flush the dictionary to file.");
+        mIsCorrupted = true;  // A failed write marks this policy as corrupted.
+        return false;
+    }
+    return true;
+}
+
+bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
+    if (!mBuffers->isUpdatable()) {
+        AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
+        return false;
+    }
+    if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
+        AKLOGE("Cannot flush the dictionary to file with GC.");
+        mIsCorrupted = true;  // A failed write marks this policy as corrupted.
+        return false;
+    }
+    return true;
+}
+
+bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
+    if (!mBuffers->isUpdatable()) {
+        AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
+        return false;
+    }
+    if (mBuffers->isNearSizeLimit()) {
+        // Additional buffer size is near the limit.
+        return true;
+    } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
+            > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
+        // Total extended region size of the trie exceeds the limit.
+        return true;
+    } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
+            && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
+        // Needs to reduce dictionary size.
+        return true;
+    } else if (mHeaderPolicy->isDecayingDict()) {
+        return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
+                mHeaderPolicy);
+    }
+    return false;
+}
+
+void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
+        char *const outResult, const int maxResultLength) {
+    const int compareLength = queryLength + 1 /* terminator */;
+    if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+        snprintf(outResult, maxResultLength, "%d",
+                mEntryCounters.getNgramCount(NgramType::Unigram));
+    } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
+        snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
+    } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+        snprintf(outResult, maxResultLength, "%d",
+                mHeaderPolicy->isDecayingDict() ?
+                        ForgettingCurveUtils::getEntryCountHardLimit(
+                                mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+                                        NgramType::Unigram)) :
+                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+    } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
+        snprintf(outResult, maxResultLength, "%d",
+                mHeaderPolicy->isDecayingDict() ?
+                        ForgettingCurveUtils::getEntryCountHardLimit(
+                                mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+                                        NgramType::Bigram)) :
+                        static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+    }
+}
+
+const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
+        const CodePointArrayView wordCodePoints) const {
+    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+    if (wordId == NOT_A_WORD_ID) {
+        AKLOGE("getWordProperty is called for invalid word.");
+        return WordProperty();
+    }
+    const LanguageModelDictContent *const languageModelDictContent =
+            mBuffers->getLanguageModelDictContent();
+    // Fetch ngram information.
+    std::vector<NgramProperty> ngrams;
+    int ngramTargetCodePoints[MAX_WORD_LENGTH];
+    int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+    int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+    bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+    for (const auto& entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
+            mHeaderPolicy, wordId)) {
+        const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
+                MAX_WORD_LENGTH, ngramTargetCodePoints);
+        const WordIdArrayView prevWordIds = entry.getPrevWordIds();
+        for (size_t i = 0; i < prevWordIds.size(); ++i) {
+            ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i],
+                    MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
+            ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry(
+                    prevWordIds[i]).representsBeginningOfSentence();
+            if (ngramPrevWordIsBeginningOfSentense[i]) {
+                ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
+                        ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
+            }
+        }
+        const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
+                ngramPrevWordIsBeginningOfSentense, prevWordIds.size());
+        const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
+        const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
+        // TODO: Output flags in WordAttributes.
+        ngrams.emplace_back(ngramContext,
+                CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
+                entry.getWordAttributes().getProbability(), *historicalInfo);
+    }
+    // Fetch shortcut information.
+    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+    int shortcutPos = getShortcutPositionOfWord(wordId);
+    if (shortcutPos != NOT_A_DICT_POS) {
+        int shortcutTarget[MAX_WORD_LENGTH];
+        const ShortcutDictContent *const shortcutDictContent =
+                mBuffers->getShortcutDictContent();
+        bool hasNext = true;
+        while (hasNext) {
+            int shortcutTargetLength = 0;
+            int shortcutProbability = NOT_A_PROBABILITY;
+            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
+                    &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
+            shortcuts.emplace_back(
+                    CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
+                    shortcutProbability);
+        }
+    }
+    const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
+            WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
+    const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
+    const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+    const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
+            wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
+            wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
+            *historicalInfo, std::move(shortcuts));
+    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
+}
+
+int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
+        int *const outCodePointCount) {
+    *outCodePointCount = 0;
+    if (token == 0) {
+        mTerminalPtNodePositionsForIteratingWords.clear();
+        DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+                &mTerminalPtNodePositionsForIteratingWords);
+        DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+        readingHelper.initWithPtNodeArrayPos(getRootPosition());
+        readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+    }
+    const int terminalPtNodePositionsVectorSize =
+            static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+    if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+        AKLOGE("Given token %d is invalid.", token);
+        return 0;
+    }
+    const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+    const PtNodeParams ptNodeParams =
+            mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
+    *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(),
+            MAX_WORD_LENGTH, outCodePoints);
+    const int nextToken = token + 1;
+    if (nextToken >= terminalPtNodePositionsVectorSize) {
+        // All words have been iterated; 0 tells the caller the iteration is finished.
+        mTerminalPtNodePositionsForIteratingWords.clear();
+        return 0;
+    }
+    return nextToken;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h
new file mode 100644
index 000000000..d130a4e78
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
+#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+
+// Word id = Artificial id that is stored in the PtNode looked up by the word.
+class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
+ public:
+    Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
+            : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
+              mDictBuffer(mBuffers->getWritableTrieBuffer()),
+              mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+                      mBuffers->getTerminalPositionLookupTable()),
+              mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
+              mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader,
+                      &mShortcutPolicy),
+              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
+              mWritingHelper(mBuffers.get()),
+              mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
+              mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}
+
+    AK_FORCE_INLINE int getRootPosition() const {
+        return 0;
+    }
+
+    void createAndGetAllChildDicNodes(const DicNode *const dicNode,
+            DicNodeVector *const childDicNodes) const;
+
+    int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+            int *const outCodePoints) const;
+
+    int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
+
+    const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+            const int wordId, MultiBigramMap *const multiBigramMap) const;
+
+    // TODO: Remove
+    int getProbability(const int unigramProbability, const int bigramProbability) const {
+        // Not used; both arguments are ignored.
+        return NOT_A_PROBABILITY;
+    }
+
+    int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
+
+    void iterateNgramEntries(const WordIdArrayView prevWordIds,
+            NgramListener *const listener) const;
+
+    BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
+
+    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
+        return mHeaderPolicy;
+    }
+
+    bool addUnigramEntry(const CodePointArrayView wordCodePoints,
+            const UnigramProperty *const unigramProperty);
+
+    bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
+
+    bool addNgramEntry(const NgramProperty *const ngramProperty);
+
+    bool removeNgramEntry(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints);
+
+    bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+            const CodePointArrayView wordCodePoints, const bool isValidWord,
+            const HistoricalInfo historicalInfo);
+
+    bool flush(const char *const filePath);
+
+    bool flushWithGC(const char *const filePath);
+
+    bool needsToRunGC(const bool mindsBlockByGC) const;
+
+    void getProperty(const char *const query, const int queryLength, char *const outResult,
+            const int maxResultLength);
+
+    const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
+
+    int getNextWordAndNextToken(const int token, int *const outCodePoints,
+            int *const outCodePointCount);
+
+    bool isCorrupted() const {
+        return mIsCorrupted;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
+
+    static const char *const UNIGRAM_COUNT_QUERY;
+    static const char *const BIGRAM_COUNT_QUERY;
+    static const char *const MAX_UNIGRAM_COUNT_QUERY;
+    static const char *const MAX_BIGRAM_COUNT_QUERY;
+    // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
+    // prevent the dictionary from overflowing.
+    static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+    static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+
+    const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
+    const HeaderPolicy *const mHeaderPolicy;
+    BufferWithExtendableBuffer *const mDictBuffer;
+    Ver4ShortcutListPolicy mShortcutPolicy;
+    Ver4PatriciaTrieNodeReader mNodeReader;
+    Ver4PtNodeArrayReader mPtNodeArrayReader;
+    Ver4PatriciaTrieNodeWriter mNodeWriter;
+    DynamicPtUpdatingHelper mUpdatingHelper;
+    Ver4PatriciaTrieWritingHelper mWritingHelper;
+    MutableEntryCounters mEntryCounters;
+    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
+    mutable bool mIsCorrupted;
+
+    int getShortcutPositionOfWord(const int wordId) const;
+};
+} // namespace latinime
+#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
new file mode 100644
index 000000000..ccb70cdd3
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
+
+#include "dictionary/utils/byte_array_utils.h"
+
+namespace latinime {
+
+/* static */ int Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(
+        const uint8_t *const buffer, int *const pos) {
+    return ByteArrayUtils::readUint32AndAdvancePosition(buffer, pos);  // Result narrowed to int.
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
new file mode 100644
index 000000000..466ff55d5
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H
+#define LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H
+
+#include <cstdint>
+
+#include "defines.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+class Ver4PatriciaTrieReadingUtils {
+ public:
+    static int getTerminalIdAndAdvancePosition(const uint8_t *const buffer,
+            int *const pos);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieReadingUtils);
+};
+} // namespace latinime
+#endif /* LATINIME_VER4_PATRICIA_TRIE_READING_UTILS_H */
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
new file mode 100644
index 000000000..6dfdf4d31
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
+
+#include <cstring>
+#include <queue>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
+        const EntryCounts &entryCounts) const {
+    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+    BufferWithExtendableBuffer headerBuffer(
+            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+    const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
+            + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
+    if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
+            entryCounts, extendedRegionSize, &headerBuffer)) {
+        AKLOGE("Cannot write header structure to buffer. "
+                "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d,"
+                " extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
+                entryCounts.getNgramCount(NgramType::Bigram),
+                entryCounts.getNgramCount(NgramType::Trigram),
+                extendedRegionSize);
+        return false;
+    }
+    return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
+        const char *const dictDirPath) {
+    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+    Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
+            Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
+                    Ver4DictConstants::MAX_DICTIONARY_SIZE));
+    MutableEntryCounters entryCounters;
+    if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &entryCounters)) {
+        return false;
+    }
+    BufferWithExtendableBuffer headerBuffer(
+            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+    if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+            entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
+        return false;
+    }
+    return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
+        const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
+        MutableEntryCounters *const outEntryCounters) {
+    Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer());
+    Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
+    Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+            mBuffers->getTerminalPositionLookupTable());
+    Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
+            mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
+
+    if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC(
+            headerPolicy, outEntryCounters)) {
+        AKLOGE("Failed to update probabilities in language model dict content.");
+        return false;
+    }
+    if (headerPolicy->isDecayingDict()) {
+        const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts();
+        if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries(
+                outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy,
+                outEntryCounters)) {
+            AKLOGE("Failed to truncate entries in language model dict content.");
+            return false;
+        }
+    }
+
+    DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
+    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    DynamicPtGcEventListeners
+            ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+                    traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+                            &ptNodeWriter);
+    if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+            &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
+        return false;
+    }
+
+    // Mapping from positions in mBuffers to positions in buffersToWrite.
+    PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
+    readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
+            buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
+    DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+            traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
+                    buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
+    if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+            &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
+        return false;
+    }
+
+    // Create policy instances for the GCed dictionary.
+    Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer());
+    Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
+    Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
+            buffersToWrite->getTerminalPositionLookupTable());
+    Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
+            buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader,
+            &newShortcutPolicy);
+    // Re-assign terminal IDs for valid terminal PtNodes.
+    TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
+    if (!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
+            &terminalIdMap)) {
+        return false;
+    }
+    // Run GC for language model dict content.
+    if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
+            mBuffers->getLanguageModelDictContent())) {
+        return false;
+    }
+    // Run GC for shortcut dict content.
+    if (!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
+            mBuffers->getShortcutDictContent())) {
+        return false;
+    }
+    DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
+    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+            traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
+    if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+            &traversePolicyToUpdateAllPositionFields)) {
+        return false;
+    }
+    newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+    TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+            traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
+    if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+            &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
+        return false;
+    }
+    return true;
+}
+
+bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    if (!ptNodeParams->isTerminal()) {
+        return true;  // Only terminal PtNodes carry a terminal id to remap.
+    }
+    TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+            mTerminalIdMap->find(ptNodeParams->getTerminalId());
+    if (it == mTerminalIdMap->end()) {
+        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
+                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
+        return false;
+    }
+    if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
+        AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
+        return false;
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
new file mode 100644
index 000000000..68dd1caa2
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
+
+#include "defines.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/utils/entry_counters.h"
+
+namespace latinime {
+
+class HeaderPolicy;
+class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeReader;
+class Ver4PatriciaTrieNodeWriter;
+
+class Ver4PatriciaTrieWritingHelper {
+ public:
+    Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
+            : mBuffers(buffers) {}
+
+    bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const;
+
+    // This method cannot be const because the original dictionary buffer is updated in order to
+    // detect useless PtNodes during GC.
+    bool writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const dictDirPath);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
+
+    class TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+            : public DynamicPtReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(
+                Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
+                const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
+                : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {}
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds);
+
+        Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
+        const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
+    };
+
+    bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
+            Ver4DictBuffers *const buffersToWrite, MutableEntryCounters *const outEntryCounters);
+
+    Ver4DictBuffers *const mBuffers;
+};
+} // namespace latinime
+
+#endif /* LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H */
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/app/src/main/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
new file mode 100644
index 000000000..63d0b4ad5
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+
+namespace latinime {
+
+bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
+        int *const outPtNodeCount, int *const outFirstPtNodePos) const {
+    if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) {
+        // Reading an invalid position because of a bug or a broken dictionary.
+        AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
+                ptNodeArrayPos, mBuffer->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    int readingPos = ptNodeArrayPos;
+    if (usesAdditionalBuffer) {
+        readingPos -= mBuffer->getOriginalBufferSize();  // Offset within the selected buffer.
+    }
+    const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
+            dictBuf, &readingPos);
+    if (usesAdditionalBuffer) {
+        readingPos += mBuffer->getOriginalBufferSize();  // Undo the subtraction made above.
+    }
+    if (ptNodeCountInArray < 0) {
+        AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray);
+        return false;
+    }
+    *outPtNodeCount = ptNodeCountInArray;
+    *outFirstPtNodePos = readingPos;
+    return true;
+}
+
+bool Ver4PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos,
+        int *const outNextPtNodeArrayPos) const {
+    if (forwordLinkPos < 0 || forwordLinkPos >= mBuffer->getTailPosition()) {
+        // Reading an invalid position because of a bug or a broken dictionary.
+        AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
+                forwordLinkPos, mBuffer->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(forwordLinkPos);
+    const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer);
+    int readingPos = forwordLinkPos;
+    if (usesAdditionalBuffer) {
+        readingPos -= mBuffer->getOriginalBufferSize();  // Offset within the selected buffer.
+    }
+    const int nextPtNodeArrayOffset =
+            DynamicPtReadingUtils::getForwardLinkPosition(dictBuf, readingPos);
+    if (DynamicPtReadingUtils::isValidForwardLinkPosition(nextPtNodeArrayOffset)) {
+        *outNextPtNodeArrayPos = forwordLinkPos + nextPtNodeArrayOffset;  // Offset from the link.
+    } else {
+        *outNextPtNodeArrayPos = NOT_A_DICT_POS;
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h b/app/src/main/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h
new file mode 100644
index 000000000..ccb760bc1
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef LATINIME_VER4_PT_NODE_ARRAY_READER_H +#define LATINIME_VER4_PT_NODE_ARRAY_READER_H + +#include "defines.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +// PtNodeArrayReader implementation for version 4 dictionaries. It reads through the given +// BufferWithExtendableBuffer and only keeps a pointer to it; this class declares no destructor, +// so the buffer is never released here. +class Ver4PtNodeArrayReader : public PtNodeArrayReader { + public: + Ver4PtNodeArrayReader(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {} + + // Both methods report success through the bool return value and deliver their results through + // the out parameters. + virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, + int *const outPtNodeCount, int *const outFirstPtNodePos) const; + virtual bool readForwardLinkAndReturnIfValid(const int forwordLinkPos, + int *const outNextPtNodeArrayPos) const; + + private: + DISALLOW_COPY_AND_ASSIGN(Ver4PtNodeArrayReader); + + const BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace latinime +#endif /* LATINIME_VER4_PT_NODE_ARRAY_READER_H */ diff --git a/app/src/main/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h b/app/src/main/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h new file mode 100644 index 000000000..a30336475 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" + +namespace latinime { + +// Forward-only cursor over bigram entries. next() hands pointers to the cursor state (bigram +// position, probability, has-next flag, reading position) to +// DictionaryBigramsStructurePolicy::getNextBigram(), which is expected to fill in the next entry. +class BinaryDictionaryBigramsIterator { + public: + // Empty iterator: hasNext() is false from the start and there is no policy to read from. + BinaryDictionaryBigramsIterator() + : mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {} + + // Iterator that starts reading at pos. hasNext() is initially false when pos is + // NOT_A_DICT_POS. + BinaryDictionaryBigramsIterator( + const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos) + : mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mHasNext(pos != NOT_A_DICT_POS) {} + + // Move constructor: takes the policy pointer and the current cursor state from the source. + BinaryDictionaryBigramsIterator(BinaryDictionaryBigramsIterator &&bigramsIterator) noexcept + : mBigramsStructurePolicy(bigramsIterator.mBigramsStructurePolicy), + mPos(bigramsIterator.mPos), mBigramPos(bigramsIterator.mBigramPos), + mProbability(bigramsIterator.mProbability), mHasNext(bigramsIterator.mHasNext) {} + + AK_FORCE_INLINE bool hasNext() const { + return mHasNext; + } + + // Advances to the next bigram. The policy pointer is dereferenced without a null check, so + // this is not valid on an empty iterator. + AK_FORCE_INLINE void next() { + mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos); + } + + // NOT_A_PROBABILITY until next() has been called. + AK_FORCE_INLINE int getProbability() const { + return mProbability; + } + + // NOT_A_DICT_POS until next() has been called. + AK_FORCE_INLINE int getBigramPos() const { + return mBigramPos; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator); + + const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy; + int mPos; + int mBigramPos; + int mProbability; + bool mHasNext; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H diff --git a/app/src/main/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h b/app/src/main/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h new file mode 100644 index 000000000..e14805e37 --- /dev/null +++
b/app/src/main/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H + +#include "defines.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" + +namespace latinime { + +class BinaryDictionaryShortcutIterator { + public: + BinaryDictionaryShortcutIterator( + const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy, + const int shortcutPos) + : mShortcutStructurePolicy(shortcutStructurePolicy), + mPos(shortcutStructurePolicy->getStartPos(shortcutPos)), + mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {} + + BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator) noexcept + : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy), + mPos(shortcutIterator.mPos), + mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {} + + AK_FORCE_INLINE bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as an int string and put it to outTarget, put its length + // to outTargetLength, put whether it is whitelist to outIsWhitelist. 
+ AK_FORCE_INLINE void nextShortcutTarget( + const int maxDepth, int *const outTarget, int *const outTargetLength, + bool *const outIsWhitelist) { + mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength, + outIsWhitelist, &mHasNextShortcutTarget, &mPos); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator); + DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator); + + const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy; + int mPos; + bool mHasNextShortcutTarget; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H diff --git a/app/src/main/jni/src/dictionary/utils/bloom_filter.h b/app/src/main/jni/src/dictionary/utils/bloom_filter.h new file mode 100644 index 000000000..1e60f49ed --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/bloom_filter.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BLOOM_FILTER_H +#define LATINIME_BLOOM_FILTER_H + +#include <bitset> + +#include "defines.h" + +namespace latinime { + +// This bloom filter is used for optimizing bigram retrieval.
+// Execution times with previous word "this" are as follows: +// without bloom filter (use only hash_map): +// Total 147792.34 (sum of others 147771.57) +// with bloom filter: +// Total 145900.64 (sum of others 145874.30) +// always read binary dictionary: +// Total 148603.14 (sum of others 148579.90) +class BloomFilter { + public: + BloomFilter() : mFilter() {} + + // Sets the bit that position maps to. + AK_FORCE_INLINE void setInFilter(const int position) { + mFilter.set(getIndex(position)); + } + + // Returns whether the bit for position is set. Different positions can share a bit, so true + // may be a false positive; false means the position was never set. + AK_FORCE_INLINE bool isInFilter(const int position) const { + return mFilter.test(getIndex(position)); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter); + + // The only hash function: position modulo BIGRAM_FILTER_MODULO. The cast to size_t keeps the + // result inside [0, BIGRAM_FILTER_MODULO) for negative positions as well. + AK_FORCE_INLINE size_t getIndex(const int position) const { + return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO; + } + + // Size, in bits, of the bloom filter index for bigrams + // The probability of false positive is (1 - e ** (-kn/m))**k, + // where k is the number of hash functions, n the number of bigrams, and m the number of + // bits we can test. + // At the moment 100 is the maximum number of bigrams for a word with the current main + // dictionaries, so n = 100. 1024 buckets give us m = 1024. + // With 1 hash function, our false positive rate is about 9.3%, which should be enough for + // our uses since we are only using this to increase average performance. For the record, + // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, + // and m = 4096 gives 2.4%. + // This is assigned here because it is used for bitset size. + // 1021 is the largest prime under 1024.
+ static const size_t BIGRAM_FILTER_MODULO = 1021; + std::bitset<BIGRAM_FILTER_MODULO> mFilter; +}; +} // namespace latinime +#endif // LATINIME_BLOOM_FILTER_H diff --git a/app/src/main/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp b/app/src/main/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp new file mode 100644 index 000000000..217569651 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; +// TODO: Needs to allocate larger memory corresponding to the current vector size. +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; + +uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(pos); + const int posInBuffer = readingPosIsInAdditionalBuffer ?
pos - mOriginalBuffer.size() : pos; + return ByteArrayUtils::readUint(getBuffer(readingPosIsInAdditionalBuffer), size, posInBuffer); +} + +uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size, + int *const pos) const { + const uint32_t value = readUint(size, *pos); + *pos += size; + return value; +} + +void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(*pos); + if (readingPosIsInAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + // Code point table is not used for dynamic format. + *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, + nullptr /* codePointTable */, outCodePoints, pos); + if (readingPosIsInAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } +} + +bool BufferWithExtendableBuffer::extend(const int size) { + return checkAndPrepareWriting(getTailPosition(), size); +} + +bool BufferWithExtendableBuffer::writeUint(const uint32_t data, const int size, const int pos) { + int writingPos = pos; + return writeUintAndAdvancePosition(data, size, &writingPos); +} + +bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, + int *const pos) { + if (!(size >= 1 && size <= 4)) { + AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); + ASSERT(false); + return false; + } + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = + usesAdditionalBuffer ? 
mAdditionalBuffer.data() : mOriginalBuffer.data(); + if (usesAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } + return true; +} + +bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints, + const int codePointCount, const bool writesTerminator, int *const pos) { + const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints( + codePoints, codePointCount, writesTerminator); + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = + usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data(); + if (usesAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount, + writesTerminator, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } + return true; +} + +bool BufferWithExtendableBuffer::extendBuffer(const size_t size) { + const size_t extendSize = std::max(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP, size); + const size_t sizeAfterExtending = + std::min(mAdditionalBuffer.size() + extendSize, mMaxAdditionalBufferSize); + if (sizeAfterExtending < mAdditionalBuffer.size() + size) { + return false; + } + mAdditionalBuffer.resize(sizeAfterExtending); + return true; +} + +bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) { + if (pos < 0 || size < 0) { + // Invalid position or size. + return false; + } + const size_t totalRequiredSize = static_cast(pos + size); + if (!isInAdditionalBuffer(pos)) { + // Here don't need to care about the additional buffer. + if (mOriginalBuffer.size() < totalRequiredSize) { + // Violate the boundary. + return false; + } + // The buffer has sufficient capacity. 
+ return true; + } + // Hereafter, pos is in the additional buffer. + const size_t tailPosition = static_cast(getTailPosition()); + if (totalRequiredSize <= tailPosition) { + // The buffer has sufficient capacity. + return true; + } + if (static_cast(pos) != tailPosition) { + // The additional buffer must be extended from the tail position. + return false; + } + const size_t extendSize = totalRequiredSize - + std::min(mAdditionalBuffer.size() + mOriginalBuffer.size(), totalRequiredSize); + if (extendSize > 0 && !extendBuffer(extendSize)) { + // Failed to extend the buffer. + return false; + } + mUsedAdditionalBufferSize += size; + return true; +} + +bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) { + int copyingPos = 0; + const int tailPos = sourceBuffer->getTailPosition(); + const int maxDataChunkSize = sizeof(uint32_t); + while (copyingPos < tailPos) { + const int remainingSize = tailPos - copyingPos; + const int copyingSize = (remainingSize >= maxDataChunkSize) ? + maxDataChunkSize : remainingSize; + const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos); + if (!writeUint(data, copyingSize, copyingPos)) { + return false; + } + copyingPos += copyingSize; + } + return true; +} + +} diff --git a/app/src/main/jni/src/dictionary/utils/buffer_with_extendable_buffer.h b/app/src/main/jni/src/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 000000000..0a141d4db --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/buffer_with_extendable_buffer.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H +#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H + +#include +#include +#include + +#include "defines.h" +#include "dictionary/utils/byte_array_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +// This is used as a buffer that can be extended for updatable dictionaries. +// To optimize performance, raw pointer is directly used for reading buffer. The position has to be +// adjusted to access additional buffer. On the other hand, this class does not provide writable +// raw pointer but provides several methods that handle boundary checking for writing data. +class BufferWithExtendableBuffer { + public: + static const size_t DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE; + + BufferWithExtendableBuffer(const ReadWriteByteArrayView originalBuffer, + const int maxAdditionalBufferSize) + : mOriginalBuffer(originalBuffer), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + // Without original buffer. + BufferWithExtendableBuffer(const int maxAdditionalBufferSize) + : mOriginalBuffer(), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + AK_FORCE_INLINE int getTailPosition() const { + return mOriginalBuffer.size() + mUsedAdditionalBufferSize; + } + + AK_FORCE_INLINE int getUsedAdditionalBufferSize() const { + return mUsedAdditionalBufferSize; + } + + /** + * For reading. 
+ */ + AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { + return position >= static_cast(mOriginalBuffer.size()); + } + + // TODO: Resolve the issue that the address can be changed when the vector is resized. + // CAVEAT!: Be careful about array out of bound access with buffers + AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { + if (usesAdditionalBuffer) { + return mAdditionalBuffer.data(); + } else { + return mOriginalBuffer.data(); + } + } + + uint32_t readUint(const int size, const int pos) const; + + uint32_t readUintAndAdvancePosition(const int size, int *const pos) const; + + void readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const; + + AK_FORCE_INLINE int getOriginalBufferSize() const { + return mOriginalBuffer.size(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize + * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100); + } + + bool extend(const int size); + + /** + * For writing. + * + * Writing is allowed for original buffer, already written region of additional buffer and the + * tail of additional buffer. 
+ */ + bool writeUint(const uint32_t data, const int size, const int pos); + + bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos); + + bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, + const bool writesTerminator, int *const pos); + + bool copy(const BufferWithExtendableBuffer *const sourceBuffer); + + private: + DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); + + static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; + static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + + const ReadWriteByteArrayView mOriginalBuffer; + std::vector mAdditionalBuffer; + int mUsedAdditionalBufferSize; + const size_t mMaxAdditionalBufferSize; + + // Return if the buffer is successfully extended or not. + bool extendBuffer(const size_t size); + + // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of + // the additional buffer, try extending the buffer. + bool checkAndPrepareWriting(const int pos, const int size); +}; +} +#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/app/src/main/jni/src/dictionary/utils/byte_array_utils.cpp b/app/src/main/jni/src/dictionary/utils/byte_array_utils.cpp new file mode 100644 index 000000000..d38f08217 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/byte_array_utils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; +const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; +const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/byte_array_utils.h b/app/src/main/jni/src/dictionary/utils/byte_array_utils.h new file mode 100644 index 000000000..abb979050 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/byte_array_utils.h @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BYTE_ARRAY_UTILS_H +#define LATINIME_BYTE_ARRAY_UTILS_H + +#include + +#include "defines.h" + +namespace latinime { + +/** + * Utility methods for reading byte arrays. + */ +class ByteArrayUtils { + public: + /** + * Integer writing + * + * Each method write a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, + const uint32_t data, const int size, int *const pos) { + // size must be in 1 to 4. 
+ ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); + return; + case 2: + ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); + return; + case 3: + ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); + return; + case 4: + ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); + return; + default: + break; + } + } + + /** + * Integer reading + * + * Each method read a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) + ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; + } + + static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; + } + + static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 8) ^ buffer[pos + 1]; + } + + static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { + return buffer[pos]; + } + + static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint32(buffer, *pos); + *pos += 4; + return value; + } + + static AK_FORCE_INLINE int readSint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t value = readUint8(buffer, *pos); + if (value < 0x80) { + return readUint24AndAdvancePosition(buffer, pos); + } else { + (*pos)++; + return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); + } + } + + static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint24(buffer, *pos); + *pos += 3; + return value; + } + + static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( + const uint8_t *const 
buffer, int *const pos) { + const uint16_t value = readUint16(buffer, *pos); + *pos += 2; + return value; + } + + static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return buffer[(*pos)++]; + } + + static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, + const int size, const int pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + return ByteArrayUtils::readUint8(buffer, pos); + case 2: + return ByteArrayUtils::readUint16(buffer, pos); + case 3: + return ByteArrayUtils::readUint24(buffer, pos); + case 4: + return ByteArrayUtils::readUint32(buffer, pos); + default: + return 0; + } + } + + /** + * Code Point Reading + * + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + */ + static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { + int p = pos; + return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); + } + + static AK_FORCE_INLINE int readCodePointAndAdvancePosition( + const uint8_t *const buffer, const int *const codePointTable, int *const pos) { + /* + * codePointTable is an array to convert the most frequent characters in this dictionary to + * 1 byte code points. It is only made of the original code points of the most frequent + * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. 
+ * The original code points are restored by picking the code points at the indices of the + * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. + */ + const uint8_t firstByte = readUint8(buffer, *pos); + if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { + if (firstByte == CHARACTER_ARRAY_TERMINATOR) { + *pos += 1; + return NOT_A_CODE_POINT; + } else { + return readUint24AndAdvancePosition(buffer, pos); + } + } else { + *pos += 1; + if (codePointTable) { + return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; + } + return firstByte; + } + } + + /** + * String (array of code points) Reading + * + * Reads code points until the terminator is found. + */ + // Returns the length of the string. + static int readStringAndAdvancePosition(const uint8_t *const buffer, + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + outBuffer[length++] = codePoint; + codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); + } + return length; + } + + // Advances the position and returns the length of the string. 
+ static int advancePositionToBehindString( + const uint8_t *const buffer, const int maxLength, int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); + length++; + } + return length; + } + + /** + * String (array of code points) Writing + */ + static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, + const int *const codePoints, const int codePointCount, const bool writesTerminator, + int *const pos) { + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + writeUint24AndAdvancePosition(buffer, codePoint, pos); + } else { + // one byte character. + writeUint8AndAdvancePosition(buffer, codePoint, pos); + } + } + if (writesTerminator) { + writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); + } + } + + static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, + const int codePointCount, const bool writesTerminator) { + int byteCount = 0; + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + byteCount += 3; + } else { + // one byte character. + byteCount += 1; + } + } + if (writesTerminator) { + // The terminator is one byte. 
+ byteCount += 1; + } + return byteCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); + + static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t CHARACTER_ARRAY_TERMINATOR; + + static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 24) & 0xFF; + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, + const uint16_t data, int *const pos) { + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, + const uint8_t data, int *const pos) { + buffer[(*pos)++] = data & 0xFF; + } +}; +} // namespace latinime +#endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/utils/dict_file_writing_utils.cpp b/app/src/main/jni/src/dictionary/utils/dict_file_writing_utils.cpp new file mode 100644 index 000000000..033a758ba --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/dict_file_writing_utils.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/dict_file_writing_utils.h" + +#include +#include +#include +#include +#include + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/format_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp"; +// Enough size to describe buffer size. 
+const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4;
+
+// Creates an empty dictionary at filePath for the format selected by dictVersion.
+// Refreshes the TimeKeeper's current time first. Returns false, after logging, when the
+// format version is not one of the supported ver4 variants; otherwise returns the result of
+// createEmptyV4DictFile.
+/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath,
+        const int dictVersion, const std::vector localeAsCodePointVector,
+        const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
+    TimeKeeper::setCurrentTime();
+    const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion);
+    switch (formatVersion) {
+        case FormatUtils::VERSION_402:
+            return createEmptyV4DictFile(
+                    filePath, localeAsCodePointVector, attributeMap, formatVersion);
+        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
+        case FormatUtils::VERSION_403:
+            return createEmptyV4DictFile(
+                    filePath, localeAsCodePointVector, attributeMap, formatVersion);
+        default:
+            AKLOGE("Cannot create dictionary %s because format version %d is not supported.",
+                    filePath, dictVersion);
+            return false;
+    }
+}
+
+// Builds an empty ver4 dictionary in memory (header plus an empty trie rooted at position 0)
+// and flushes it to dirPath. Returns false when the empty trie cannot be written to the
+// in-memory buffer; otherwise returns the result of the flush.
+// NOTE(review): the template parameter list here, and the explicit template arguments at the
+// two call sites in createEmptyDictFile, are missing from this copy of the patch although the
+// body uses DictConstants, DictBuffers and DictBuffersPtr - TODO confirm against the source.
+template
+/* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath,
+        const std::vector localeAsCodePointVector,
+        const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+        const FormatUtils::FORMAT_VERSION formatVersion) {
+    HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap);
+    DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy,
+            DictConstants::MAX_DICT_EXTENDED_REGION_SIZE);
+    headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
+            EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer());
+    if (!DynamicPtWritingUtils::writeEmptyDictionary(
+            dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) {
+        AKLOGE("Empty ver4 dictionary structure cannot be created on memory.");
+        return false;
+    }
+    return dictBuffers->flush(dirPath);
+}
+
+// Writes buffer to the file named basePath followed by suffix. The path is assembled in a
+// stack buffer sized by FileUtils::getFilePathWithSuffixBufSize.
+/* static */ bool DictFileWritingUtils::flushBufferToFileWithSuffix(const char *const basePath,
+        const char *const suffix, const BufferWithExtendableBuffer *const buffer) {
+    const int filePathBufSize = FileUtils::getFilePathWithSuffixBufSize(basePath, suffix);
+    char filePath[filePathBufSize];
+    FileUtils::getFilePathWithSuffix(basePath, suffix, filePathBufSize, filePath);
+    return flushBufferToFile(filePath, buffer);
+}
+
+// Appends buffer to an already opened file, preceded by a SIZE_OF_BUFFER_SIZE_FIELD byte
+// field holding the buffer's tail position. Returns false as soon as a write fails.
+/* static */ bool DictFileWritingUtils::writeBufferToFileTail(FILE *const file,
+        const BufferWithExtendableBuffer *const buffer) {
+    uint8_t bufferSize[SIZE_OF_BUFFER_SIZE_FIELD];
+    int writingPos = 0;
+    ByteArrayUtils::writeUintAndAdvancePosition(bufferSize, buffer->getTailPosition(),
+            SIZE_OF_BUFFER_SIZE_FIELD, &writingPos);
+    if (fwrite(bufferSize, SIZE_OF_BUFFER_SIZE_FIELD, 1 /* count */, file) < 1) {
+        return false;
+    }
+    return writeBufferToFile(file, buffer);
+}
+
+// Creates filePath (which must not exist yet, because of O_EXCL) readable and writable by the
+// owner only, and writes buffer into it. Returns true only when every byte was written and
+// the stream was closed successfully. On any failure after the file was created, the partial
+// file is removed so that a later attempt is not blocked by O_EXCL.
+/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath,
+        const BufferWithExtendableBuffer *const buffer) {
+    const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+    if (fd == -1) {
+        AKLOGE("File %s cannot be opened. errno: %d", filePath, errno);
+        ASSERT(false);
+        return false;
+    }
+    FILE *const file = fdopen(fd, "wb");
+    if (!file) {
+        AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno);
+        // open() has already created the file; do not leave an empty one behind.
+        // NOTE(review): fd is still open at this point and is never closed. Closing it needs
+        // close() from <unistd.h>, which could not be confirmed as included in this file.
+        remove(filePath);
+        ASSERT(false);
+        return false;
+    }
+    if (!writeBufferToFile(file, buffer)) {
+        fclose(file);
+        remove(filePath);
+        AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath,
+                buffer->getTailPosition());
+        ASSERT(false);
+        return false;
+    }
+    // fclose() flushes data still held in the stdio buffer, so a failure here means the file
+    // on disk may be incomplete and must not be reported as a successful write.
+    if (fclose(file) != 0) {
+        AKLOGE("Cannot close the file %s. errno: %d", filePath, errno);
+        remove(filePath);
+        ASSERT(false);
+        return false;
+    }
+    return true;
+}
+
+// Returns whether the writing was succeeded or not.
+/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer) { + const int originalBufSize = buffer->getOriginalBufferSize(); + if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */), + originalBufSize, 1, file) < 1) { + return false; + } + const int additionalBufSize = buffer->getUsedAdditionalBufferSize(); + if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */), + additionalBufSize, 1, file) < 1) { + return false; + } + return true; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/dict_file_writing_utils.h b/app/src/main/jni/src/dictionary/utils/dict_file_writing_utils.h new file mode 100644 index 000000000..102a89da4 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/dict_file_writing_utils.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H
+#define LATINIME_DICT_FILE_WRITING_UTILS_H
+
+#include
+
+#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/utils/format_utils.h"
+
+namespace latinime {
+
+class BufferWithExtendableBuffer;
+
+// Static helpers for creating empty dictionary files and for writing
+// BufferWithExtendableBuffer contents to files. The class is not meant to be instantiated.
+class DictFileWritingUtils {
+ public:
+    // Suffix string for temporary files used while writing a dictionary file.
+    static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE;
+
+    // Creates an empty dictionary at filePath in the format selected by dictVersion.
+    // Returns false when the format version is not supported or writing fails.
+    static bool createEmptyDictFile(const char *const filePath, const int dictVersion,
+            const std::vector localeAsCodePointVector,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap);
+
+    // Writes buffer to the file whose path is basePath followed by suffix.
+    static bool flushBufferToFileWithSuffix(const char *const basePath, const char *const suffix,
+            const BufferWithExtendableBuffer *const buffer);
+
+    // Writes a size field followed by the contents of buffer to an already opened file.
+    static bool writeBufferToFileTail(FILE *const file,
+            const BufferWithExtendableBuffer *const buffer);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils);
+
+    // Number of bytes used by writeBufferToFileTail for the size field.
+    static const int SIZE_OF_BUFFER_SIZE_FIELD;
+
+    // NOTE(review): no definition of this function appears in dict_file_writing_utils.cpp as
+    // added by this patch - confirm whether the declaration is still needed.
+    static bool createEmptyV401DictFile(const char *const filePath,
+            const std::vector localeAsCodePointVector,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    // Builds an empty ver4 dictionary in memory and flushes it to filePath.
+    template
+    static bool createEmptyV4DictFile(const char *const filePath,
+            const std::vector localeAsCodePointVector,
+            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap,
+            const FormatUtils::FORMAT_VERSION formatVersion);
+
+    // Creates filePath and writes buffer into it.
+    static bool flushBufferToFile(const char *const filePath,
+            const BufferWithExtendableBuffer *const buffer);
+
+    // Writes the original and the used additional region of buffer to an opened file.
+    static bool writeBufferToFile(FILE *const file,
+            const BufferWithExtendableBuffer *const buffer);
+};
+} // namespace latinime
+#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */
diff --git a/app/src/main/jni/src/dictionary/utils/entry_counters.h b/app/src/main/jni/src/dictionary/utils/entry_counters.h
new file mode 100644
index 000000000..5e443026e
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/utils/entry_counters.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_ENTRY_COUNTERS_H
+#define LATINIME_ENTRY_COUNTERS_H
+
+#include
+
+#include "defines.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Read-only snapshot of per-n-gram-type entry counts.
+// Copyable but immutable
+class EntryCounts final {
+ public:
+    // Creates a snapshot in which every one of the four counts is zero.
+    EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {}
+
+    // Creates a snapshot holding a copy of counters.
+    explicit EntryCounts(const std::array &counters)
+            : mEntryCounts(counters) {}
+
+    // Returns the count stored for ngramType; the enum value is used as the array index.
+    int getNgramCount(const NgramType ngramType) const {
+        return mEntryCounts[static_cast(ngramType)];
+    }
+
+    // Returns a reference to the underlying array of counts.
+    const std::array &getCountArray() const {
+        return mEntryCounts;
+    }
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts);
+
+    // Counts from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram
+    // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element)
+    const std::array mEntryCounts;
+};
+
+// Mutable set of per-n-gram-type counters, from which an EntryCounts snapshot can be taken.
+class MutableEntryCounters final {
+ public:
+    // Creates counters that all start at zero.
+    MutableEntryCounters() {
+        mEntryCounters.fill(0);
+    }
+
+    // Creates counters initialized with a copy of counters.
+    explicit MutableEntryCounters(
+            const std::array &counters)
+            : mEntryCounters(counters) {}
+
+    // Returns an immutable copy of the current counter values.
+    const EntryCounts getEntryCounts() const {
+        return EntryCounts(mEntryCounters);
+    }
+
+    // Adds one to the counter for ngramType.
+    void incrementNgramCount(const NgramType ngramType) {
+        ++mEntryCounters[static_cast(ngramType)];
+    }
+
+    // Subtracts one from the counter for ngramType. There is no lower bound check, so the
+    // counter can become negative.
+    void decrementNgramCount(const NgramType ngramType) {
+        --mEntryCounters[static_cast(ngramType)];
+    }
+
+    // Returns the current counter value for ngramType.
+    int getNgramCount(const NgramType ngramType) const {
+        return mEntryCounters[static_cast(ngramType)];
+    }
+
+    // Overwrites the counter for ngramType with count.
+    void setNgramCount(const NgramType ngramType, const int count) {
+        mEntryCounters[static_cast(ngramType)] = count;
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters);
+
+    // Counters from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram
+    // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element)
+    std::array mEntryCounters;
+};
+} // namespace latinime
+#endif /* LATINIME_ENTRY_COUNTERS_H */
diff --git a/app/src/main/jni/src/dictionary/utils/file_utils.cpp b/app/src/main/jni/src/dictionary/utils/file_utils.cpp
new file mode 100644
index 000000000..bb392fb32
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/utils/file_utils.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/file_utils.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace latinime {
+
+// Returns -1 on error.
+/* static */ int FileUtils::getFileSize(const char *const filePath) { + const int fd = open(filePath, O_RDONLY); + if (fd == -1) { + return -1; + } + struct stat statBuf; + if (fstat(fd, &statBuf) != 0) { + close(fd); + return -1; + } + close(fd); + return static_cast(statBuf.st_size); +} + +/* static */ bool FileUtils::existsDir(const char *const dirPath) { + DIR *const dir = opendir(dirPath); + if (dir == NULL) { + return false; + } + closedir(dir); + return true; +} + +// Remove a directory and all files in the directory. +/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath) { + return removeDirAndFiles(dirPath, 5 /* maxTries */); +} + +// Remove a directory and all files in the directory, trying up to maxTimes. +/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath, const int maxTries) { + DIR *const dir = opendir(dirPath); + if (dir == NULL) { + AKLOGE("Cannot open dir %s.", dirPath); + return true; + } + struct dirent *dirent; + while ((dirent = readdir(dir)) != NULL) { + if (dirent->d_type == DT_DIR) { + continue; + } + if (strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0) { + continue; + } + const int filePathBufSize = getFilePathBufSize(dirPath, dirent->d_name); + char filePath[filePathBufSize]; + getFilePath(dirPath, dirent->d_name, filePathBufSize, filePath); + if (remove(filePath) != 0) { + AKLOGE("Cannot remove file %s.", filePath); + closedir(dir); + return false; + } + } + closedir(dir); + if (remove(dirPath) != 0) { + if (maxTries > 0) { + // On NFS, deleting files sometimes creates new files. I'm not sure what the + // correct way of dealing with this is, but for the time being, this seems to work. 
+ removeDirAndFiles(dirPath, maxTries - 1); + } else { + AKLOGE("Cannot remove directory %s.", dirPath); + return false; + } + } + return true; +} + +/* static */ int FileUtils::getFilePathWithSuffixBufSize(const char *const filePath, + const char *const suffix) { + return strlen(filePath) + strlen(suffix) + 1 /* terminator */; +} + +/* static */ void FileUtils::getFilePathWithSuffix(const char *const filePath, + const char *const suffix, const int filePathBufSize, char *const outFilePath) { + snprintf(outFilePath, filePathBufSize, "%s%s", filePath, suffix); +} + +/* static */ int FileUtils::getFilePathBufSize(const char *const dirPath, + const char *const fileName) { + return strlen(dirPath) + 1 /* '/' */ + strlen(fileName) + 1 /* terminator */; +} + +/* static */ void FileUtils::getFilePath(const char *const dirPath, const char *const fileName, + const int filePathBufSize, char *const outFilePath) { + snprintf(outFilePath, filePathBufSize, "%s/%s", dirPath, fileName); +} + +/* static */ bool FileUtils::getFilePathWithoutSuffix(const char *const filePath, + const char *const suffix, const int outDirPathBufSize, char *const outDirPath) { + const int filePathLength = strlen(filePath); + const int suffixLength = strlen(suffix); + if (filePathLength <= suffixLength) { + AKLOGE("File path length (%s:%d) is shorter that suffix length (%s:%d).", + filePath, filePathLength, suffix, suffixLength); + return false; + } + const int resultFilePathLength = filePathLength - suffixLength; + if (outDirPathBufSize <= resultFilePathLength) { + AKLOGE("outDirPathBufSize is too small. 
filePath: %s, suffix: %s, outDirPathBufSize: %d", + filePath, suffix, outDirPathBufSize); + return false; + } + if (strncmp(filePath + resultFilePathLength, suffix, suffixLength) != 0) { + AKLOGE("File Path %s does not have %s as a suffix", filePath, suffix); + return false; + } + snprintf(outDirPath, resultFilePathLength + 1 /* terminator */, "%s", filePath); + return true; +} + +/* static */ void FileUtils::getDirPath(const char *const filePath, const int outDirPathBufSize, + char *const outDirPath) { + for (int i = strlen(filePath) - 1; i >= 0; --i) { + if (filePath[i] == '/') { + if (i >= outDirPathBufSize) { + AKLOGE("outDirPathBufSize is too small. filePath: %s, outDirPathBufSize: %d", + filePath, outDirPathBufSize); + ASSERT(false); + return; + } + snprintf(outDirPath, i + 1 /* terminator */, "%s", filePath); + return; + } + } +} + +/* static */ void FileUtils::getBasename(const char *const filePath, + const int outNameBufSize, char *const outName) { + const int filePathBufSize = strlen(filePath) + 1 /* terminator */; + char filePathBuf[filePathBufSize]; + snprintf(filePathBuf, filePathBufSize, "%s", filePath); + const char *const baseName = basename(filePathBuf); + const int baseNameLength = strlen(baseName); + if (baseNameLength >= outNameBufSize) { + AKLOGE("outNameBufSize is too small. filePath: %s, outNameBufSize: %d", + filePath, outNameBufSize); + return; + } + snprintf(outName, baseNameLength + 1 /* terminator */, "%s", baseName); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/file_utils.h b/app/src/main/jni/src/dictionary/utils/file_utils.h new file mode 100644 index 000000000..4f1b93a6a --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/file_utils.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_FILE_UTILS_H
+#define LATINIME_FILE_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Static helpers for file sizes, directory removal and building or splitting file paths.
+// Functions that fill an output buffer take the buffer size as a separate argument.
+class FileUtils {
+ public:
+    // Returns -1 on error.
+    static int getFileSize(const char *const filePath);
+
+    // Returns whether dirPath can be opened as a directory.
+    static bool existsDir(const char *const dirPath);
+
+    // Remove a directory and all files in the directory.
+    static bool removeDirAndFiles(const char *const dirPath);
+
+    // Returns the buffer size needed for filePath followed by suffix, including terminator.
+    static int getFilePathWithSuffixBufSize(const char *const filePath, const char *const suffix);
+
+    // Writes filePath followed by suffix into outFilePath.
+    static void getFilePathWithSuffix(const char *const filePath, const char *const suffix,
+            const int filePathBufSize, char *const outFilePath);
+
+    // Returns the buffer size needed for "dirPath/fileName", including terminator.
+    static int getFilePathBufSize(const char *const dirPath, const char *const fileName);
+
+    // Writes "dirPath/fileName" into outFilePath.
+    static void getFilePath(const char *const dirPath, const char *const fileName,
+            const int filePathBufSize, char *const outFilePath);
+
+    // Writes filePath without the trailing suffix into outDirPath. Returns whether the
+    // filePath has the suffix and the result fits in the output buffer.
+    static bool getFilePathWithoutSuffix(const char *const filePath, const char *const suffix,
+            const int dirPathBufSize, char *const outDirPath);
+
+    // Writes the part of filePath before its last '/' into outDirPath.
+    static void getDirPath(const char *const filePath, const int dirPathBufSize,
+            char *const outDirPath);
+
+    // Writes the base name of filePath into outName.
+    static void getBasename(const char *const filePath, const int outNameBufSize,
+            char *const outName);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(FileUtils);
+
+    // Variant of the public removeDirAndFiles with an explicit number of remaining retries.
+    static bool removeDirAndFiles(const char *const dirPath, const int maxTries);
+};
+} // namespace latinime
+#endif /* LATINIME_FILE_UTILS_H */
diff --git a/app/src/main/jni/src/dictionary/utils/forgetting_curve_utils.cpp b/app/src/main/jni/src/dictionary/utils/forgetting_curve_utils.cpp
new file mode 100644
index 000000000..d79ed911b
--- /dev/null
+++ b/app/src/main/jni/src/dictionary/utils/forgetting_curve_utils.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "dictionary/utils/forgetting_curve_utils.h" + +#include +#include +#include + +#include "dictionary/header/header_policy.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; +const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; + +const int ForgettingCurveUtils::MAX_LEVEL = 15; +const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2; +const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31; +const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30; +const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; +// TODO: Evaluate whether this should be 7.5 days. +// 15 days +const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; + +const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; + +const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; + +// TODO: Revise the logic to decide the initial probability depending on the given probability. +/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { + const int timestamp = newHistoricalInfo->getTimestamp(); + if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { + // Add entry as a valid word. 
+ const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); + } else if (!originalHistoricalInfo->isValid() + || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() + || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() + && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { + // Initial information. + int count = newHistoricalInfo->getCount(); + if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1); + return HistoricalInfo(timestamp, level, 0 /* count */); + } + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); + return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy)); + } else { + const int updatedCount = originalHistoricalInfo->getCount() + 1; + if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + // The count exceeds the max value the level can be incremented. + if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { + // The level is already max. + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); + } else { + // Raise the level. 
+ return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel() + 1, 0 /* count */); + } + } else { + return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); + } + } +} + +/* static */ int ForgettingCurveUtils::decodeProbability( + const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS); + return sProbabilityTable.getProbability( + headerPolicy->getForgettingCurveProbabilityValuesTableId(), + clampToValidLevelRange(historicalInfo->getLevel()), + clampToValidTimeStepCountRange(elapsedTimeStepCount)); +} + +/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy) { + return historicalInfo->getLevel() > 0 + || getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS) + < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; +} + +/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy) { + if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) { + return HistoricalInfo(); + } + const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; + const int elapsedTimeStep = getElapsedTimeStepCount( + originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds); + if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { + // No need to update historical info. + return *originalHistoricalInfo; + } + // Lower the level. + const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? 
+ originalHistoricalInfo->getLevel() : maxLevelDownAmonut; + const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() + + levelDownAmount * durationToLevelDownInSeconds; + return HistoricalInfo(adjustedTimestampInSeconds, + originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); +} + +/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, + const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { + const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts(); + for (const auto ngramType : AllNgramTypes::ASCENDING) { + if (entryCounts.getNgramCount(ngramType) + >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) { + // Unigram count exceeds the limit. + return true; + } + } + if (mindsBlockByDecay) { + return false; + } + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS + < TimeKeeper::peekCurrentTime()) { + // Time to decay. + return true; + } + return false; +} + +// See comments in ProbabilityUtils::backoff(). +/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { + // See TODO comments in ForgettingCurveUtils::getProbability(). 
+ return unigramProbability; +} + +/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp, + const int durationToLevelDownInSeconds) { + const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp; + const int timeStepDurationInSeconds = + durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + return elapsedTimeInSeconds / timeStepDurationInSeconds; +} + +/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { + return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, + const HeaderPolicy *const headerPolicy) { + return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1); +} + +/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { + return std::min(std::max(level, 0), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { + return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); +} + +const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10; + + +ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { + mTables.resize(PROBABILITY_TABLE_COUNT); + for (int tableId = 0; tableId < 
PROBABILITY_TABLE_COUNT; ++tableId) { + mTables[tableId].resize(MAX_LEVEL + 1); + for (int level = 0; level <= MAX_LEVEL; ++level) { + mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); + const float initialProbability = getBaseProbabilityForLevel(tableId, level); + const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); + for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; + ++timeStepCount) { + if (level < MIN_VISIBLE_LEVEL) { + mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; + continue; + } + const float probability = initialProbability + * powf(initialProbability / endProbability, + -1.0f * static_cast(timeStepCount) + / static_cast(MAX_ELAPSED_TIME_STEP_COUNT + 1)); + mTables[tableId][level][timeStepCount] = + std::min(std::max(static_cast(probability), 1), MAX_PROBABILITY); + } + } + } +} + +/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel( + const int tableId, const int level) { + if (tableId == WEAK_PROBABILITY_TABLE_ID) { + // Max probability is 127. + return static_cast(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level))); + } else if (tableId == MODEST_PROBABILITY_TABLE_ID) { + // Max probability is 128. + return static_cast(MODEST_BASE_PROBABILITY * (level + 1)); + } else if (tableId == STRONG_PROBABILITY_TABLE_ID) { + // Max probability is 140. + return static_cast(STRONG_BASE_PROBABILITY * (level + 1)); + } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) { + // Max probability is 160. 
+ return static_cast(AGGRESSIVE_BASE_PROBABILITY * (level + 1)); + } else { + return NOT_A_PROBABILITY; + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/forgetting_curve_utils.h b/app/src/main/jni/src/dictionary/utils/forgetting_curve_utils.h new file mode 100644 index 000000000..ddaac7e3b --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/forgetting_curve_utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_FORGETTING_CURVE_UTILS_H +#define LATINIME_FORGETTING_CURVE_UTILS_H + +#include + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/utils/entry_counters.h" + +namespace latinime { + +class HeaderPolicy; + +class ForgettingCurveUtils { + public: + static const HistoricalInfo createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy); + + static const HistoricalInfo createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy); + + static int decodeProbability(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); + + static bool needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); + + static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters, + const HeaderPolicy *const headerPolicy); + + // TODO: Improve probability computation method and remove this. 
+ static int getProbabilityBiasForNgram(const int n) { + return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE; + } + + AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) { + return static_cast(static_cast(maxEntryCount) + * ENTRY_COUNT_HARD_LIMIT_WEIGHT); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); + + class ProbabilityTable { + public: + ProbabilityTable(); + + int getProbability(const int tableId, const int level, + const int elapsedTimeStepCount) const { + return mTables[tableId][level][elapsedTimeStepCount]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); + + static const int PROBABILITY_TABLE_COUNT; + static const int WEAK_PROBABILITY_TABLE_ID; + static const int MODEST_PROBABILITY_TABLE_ID; + static const int STRONG_PROBABILITY_TABLE_ID; + static const int AGGRESSIVE_PROBABILITY_TABLE_ID; + + static const int WEAK_MAX_PROBABILITY; + static const int MODEST_BASE_PROBABILITY; + static const int STRONG_BASE_PROBABILITY; + static const int AGGRESSIVE_BASE_PROBABILITY; + + std::vector>> mTables; + + static int getBaseProbabilityForLevel(const int tableId, const int level); + }; + + static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE; + static const int DECAY_INTERVAL_SECONDS; + + static const int MAX_LEVEL; + static const int MIN_VISIBLE_LEVEL; + static const int MAX_ELAPSED_TIME_STEP_COUNT; + static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + static const int OCCURRENCES_TO_RAISE_THE_LEVEL; + static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; + + static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT; + + static const ProbabilityTable sProbabilityTable; + + static int backoff(const int unigramProbability); + static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); + static int clampToVisibleEntryLevelRange(const int level); + static int clampToValidLevelRange(const int level); + static int clampToValidCountRange(const int count, const 
HeaderPolicy *const headerPolicy); + static int clampToValidTimeStepCountRange(const int timeStepCount); +}; +} // namespace latinime +#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/utils/format_utils.cpp b/app/src/main/jni/src/dictionary/utils/format_utils.cpp new file mode 100644 index 000000000..cef3b094c --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/format_utils.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/format_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; + +// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 +const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; + +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { + switch (formatVersion) { + case VERSION_2: + case VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return UNKNOWN_VERSION; + case VERSION_202: + return VERSION_202; + case VERSION_4_ONLY_FOR_TESTING: + return VERSION_4_ONLY_FOR_TESTING; + case VERSION_402: + return VERSION_402; + case VERSION_403: + return VERSION_403; + default: + return UNKNOWN_VERSION; + } +} +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( + const ReadOnlyByteArrayView dictBuffer) { + // The magic number is stored big-endian. + // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't + // understand this format. + if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0); + switch (magicNumber) { + case MAGIC_NUMBER: + // The layout of the header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Dictionary format version number (2 bytes) + // Options (2 bytes) + // Header size (4 bytes) : integer, big endian + // Conceptually this converts the hardcoded value of the bytes in the file into + // the symbolic value we use in the code. But we want the constants to be the + // same so we use them for both here. 
+ return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4)); + default: + return UNKNOWN_VERSION; + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/format_utils.h b/app/src/main/jni/src/dictionary/utils/format_utils.h new file mode 100644 index 000000000..1616efcce --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/format_utils.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FORMAT_UTILS_H +#define LATINIME_FORMAT_UTILS_H + +#include + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/** + * Methods to handle binary dictionary format version. + */ +class FormatUtils { + public: + enum FORMAT_VERSION { + // These MUST have the same values as the relevant constants in FormatSpec.java. + // TODO: Remove VERSION_2 and VERSION_201 when we: + // * Confirm that old versions of LatinIME download old-format dictionaries + // * We no longer need the corresponding constants on the Java side for dicttool + VERSION_2 = 2, + VERSION_201 = 201, + VERSION_202 = 202, + VERSION_4_ONLY_FOR_TESTING = 399, + VERSION_402 = 402, + VERSION_403 = 403, + UNKNOWN_VERSION = -1 + }; + + // 32 bit magic number is stored at the beginning of the dictionary header to reject + // unsupported or obsolete dictionary formats. 
+ static const uint32_t MAGIC_NUMBER; + + static FORMAT_VERSION getFormatVersion(const int formatVersion); + static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); + + static const size_t DICTIONARY_MINIMUM_SIZE; +}; +} // namespace latinime +#endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/utils/mmapped_buffer.cpp b/app/src/main/jni/src/dictionary/utils/mmapped_buffer.cpp new file mode 100644 index 000000000..c5259de6d --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/mmapped_buffer.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/mmapped_buffer.h" + +#include +#include +#include +#include +#include +#include + +#include "dictionary/utils/file_utils.h" + +namespace latinime { + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const int bufferOffset, const int bufferSize, + const bool isUpdatable) { + const int mmapFd = open(path, O_RDONLY); + if (mmapFd < 0) { + AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); + return nullptr; + } + const int pagesize = sysconf(_SC_PAGESIZE); + const int offset = bufferOffset % pagesize; + int alignedOffset = bufferOffset - offset; + int alignedSize = bufferSize + offset; + const int protMode = isUpdatable ? 
PROT_READ | PROT_WRITE : PROT_READ; + void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, + alignedOffset); + if (mmappedBuffer == MAP_FAILED) { + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + close(mmapFd); + return nullptr; + } + uint8_t *const buffer = static_cast(mmappedBuffer) + offset; + if (!buffer) { + AKLOGE("DICT: buffer is null"); + close(mmapFd); + return nullptr; + } + return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, + mmapFd, isUpdatable)); +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const bool isUpdatable) { + const int fileSize = FileUtils::getFileSize(path); + if (fileSize == -1) { + return nullptr; + } else if (fileSize == 0) { + return MmappedBufferPtr(new MmappedBuffer(isUpdatable)); + } else { + return openBuffer(path, 0 /* bufferOffset */, fileSize, isUpdatable); + } +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const dirPath, const char *const fileName, const bool isUpdatable) { + const int filePathBufferSize = PATH_MAX + 1 /* terminator */; + char filePath[filePathBufferSize]; + const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath, + fileName); + if (filePathLength >= filePathBufferSize) { + return nullptr; + } + return openBuffer(filePath, isUpdatable); +} + +MmappedBuffer::~MmappedBuffer() { + if (mAlignedSize == 0) { + return; + } + int ret = munmap(mMmappedBuffer, mAlignedSize); + if (ret != 0) { + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + } + ret = close(mMmapFd); + if (ret != 0) { + AKLOGE("DICT: Failure in close. 
ret=%d errno=%d", ret, errno); + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/mmapped_buffer.h b/app/src/main/jni/src/dictionary/utils/mmapped_buffer.h new file mode 100644 index 000000000..e25310373 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/mmapped_buffer.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_MMAPPED_BUFFER_H +#define LATINIME_MMAPPED_BUFFER_H + +#include +#include + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class MmappedBuffer { + public: + typedef std::unique_ptr MmappedBufferPtr; + + static MmappedBufferPtr openBuffer(const char *const path, + const int bufferOffset, const int bufferSize, const bool isUpdatable); + + // Mmap entire file. 
+ static MmappedBufferPtr openBuffer(const char *const path, const bool isUpdatable); + + static MmappedBufferPtr openBuffer(const char *const dirPath, const char *const fileName, + const bool isUpdatable); + + ~MmappedBuffer(); + + ReadWriteByteArrayView getReadWriteByteArrayView() const { + return mByteArrayView; + } + + ReadOnlyByteArrayView getReadOnlyByteArrayView() const { + return mByteArrayView.getReadOnlyView(); + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + private: + AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize, + void *const mmappedBuffer, const int alignedSize, const int mmapFd, + const bool isUpdatable) + : mByteArrayView(buffer, bufferSize), mMmappedBuffer(mmappedBuffer), + mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} + + // Empty file. We have to handle an empty file as a valid part of a dictionary. + AK_FORCE_INLINE MmappedBuffer(const bool isUpdatable) + : mByteArrayView(), mMmappedBuffer(nullptr), mAlignedSize(0), + mMmapFd(0), mIsUpdatable(isUpdatable) {} + + DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); + + const ReadWriteByteArrayView mByteArrayView; + void *const mMmappedBuffer; + const int mAlignedSize; + const int mMmapFd; + const bool mIsUpdatable; +}; +} +#endif /* LATINIME_MMAPPED_BUFFER_H */ diff --git a/app/src/main/jni/src/dictionary/utils/multi_bigram_map.cpp b/app/src/main/jni/src/dictionary/utils/multi_bigram_map.cpp new file mode 100644 index 000000000..e730fff8e --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/multi_bigram_map.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/multi_bigram_map.h" + +#include +#include + +namespace latinime { + +// Max number of bigram maps (previous word contexts) to be cached. Increasing this number +// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory +// usage. Also, there are diminishing returns since the most frequently used bigrams are +// typically near the beginning of the input and are thus the first ones to be cached. Note +// that these bigrams are reset for each new composing word. +const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; + +// Most common previous word contexts currently have 100 bigrams +const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; + +// Look up the bigram probability for the given word pair from the cached bigram maps. +// Also caches the bigrams if there is space remaining and they have not been cached already. 
+int MultiBigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, + const int unigramProbability) { + if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) { + return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); + } + const auto mapPosition = mBigramMaps.find(prevWordIds[0]); + if (mapPosition != mBigramMaps.end()) { + return mapPosition->second.getBigramProbability(structurePolicy, nextWordId, + unigramProbability); + } + if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { + addBigramsForWord(structurePolicy, prevWordIds); + return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy, + nextWordId, unigramProbability); + } + return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds, + nextWordId, unigramProbability); +} + +void MultiBigramMap::BigramMap::init( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds) { + structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */); +} + +int MultiBigramMap::BigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordId, const int unigramProbability) const { + int bigramProbability = NOT_A_PROBABILITY; + if (mBloomFilter.isInFilter(nextWordId)) { + const auto bigramProbabilityIt = mBigramMap.find(nextWordId); + if (bigramProbabilityIt != mBigramMap.end()) { + bigramProbability = bigramProbabilityIt->second; + } + } + return structurePolicy->getProbability(unigramProbability, bigramProbability); +} + +void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) { + if (targetWordId == NOT_A_WORD_ID) { + return; + } + mBigramMap[targetWordId] = ngramProbability; + mBloomFilter.setInFilter(targetWordId); +} + +void MultiBigramMap::addBigramsForWord( + const 
DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds) { + mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds); +} + +int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) { + const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId); + if (bigramProbability != NOT_A_PROBABILITY) { + return bigramProbability; + } + return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/multi_bigram_map.h b/app/src/main/jni/src/dictionary/utils/multi_bigram_map.h new file mode 100644 index 000000000..6f23d98bc --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/multi_bigram_map.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_MULTI_BIGRAM_MAP_H +#define LATINIME_MULTI_BIGRAM_MAP_H + +#include +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/bloom_filter.h" +#include "utils/int_array_view.h" + +namespace latinime { + +// Class for caching bigram maps for multiple previous word contexts. This is useful since the +// algorithm needs to look up the set of bigrams for every word pair that occurs in every +// multi-word suggestion. +class MultiBigramMap { + public: + MultiBigramMap() : mBigramMaps() {} + ~MultiBigramMap() {} + + // Look up the bigram probability for the given word pair from the cached bigram maps. + // Also caches the bigrams if there is space remaining and they have not been cached already. + int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); + + void clear() { + mBigramMaps.clear(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(MultiBigramMap); + + class BigramMap : public NgramListener { + public: + BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} + // Copy constructor needed for std::unordered_map. 
+ BigramMap(const BigramMap &bigramMap) + : mBigramMap(bigramMap.mBigramMap), mBloomFilter(bigramMap.mBloomFilter) {} + virtual ~BigramMap() {} + + void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); + int getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordId, const int unigramProbability) const; + virtual void onVisitEntry(const int ngramProbability, const int targetWordId); + + private: + static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; + std::unordered_map mBigramMap; + BloomFilter mBloomFilter; + }; + + void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); + + int readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); + + static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; + std::unordered_map mBigramMaps; +}; +} // namespace latinime +#endif // LATINIME_MULTI_BIGRAM_MAP_H diff --git a/app/src/main/jni/src/dictionary/utils/probability_utils.cpp b/app/src/main/jni/src/dictionary/utils/probability_utils.cpp new file mode 100644 index 000000000..426a0e783 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/probability_utils.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/probability_utils.h" + +namespace latinime { + +const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f; + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/probability_utils.h b/app/src/main/jni/src/dictionary/utils/probability_utils.h new file mode 100644 index 000000000..2050af1e9 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/probability_utils.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include +#include + +#include "defines.h" + +namespace latinime { + +// TODO: Quit using bigram probability to indicate the delta. +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 
0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictEncoder#makeBigramFlags for details. + const float stepSize = static_cast(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast(static_cast(bigramProbability + 1) * stepSize); + } + + // Encode probability using the same way as we are doing for main dictionaries. + static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) { + const float probability = static_cast(MAX_PROBABILITY) + + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER; + if (probability < 0.0f) { + return 0; + } + return std::min(static_cast(probability + 0.5f), MAX_PROBABILITY); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); + + static const float PROBABILITY_ENCODING_SCALER; +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/app/src/main/jni/src/dictionary/utils/sparse_table.cpp b/app/src/main/jni/src/dictionary/utils/sparse_table.cpp new file mode 100644 index 000000000..029329fab --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/sparse_table.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/sparse_table.h" + +namespace latinime { + +const int SparseTable::NOT_EXIST = -1; +const int SparseTable::INDEX_SIZE = 4; + +bool SparseTable::contains(const int id) const { + const int readingPos = getPosInIndexTable(id); + if (id < 0 || mIndexTableBuffer->getTailPosition() <= readingPos) { + return false; + } + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, readingPos); + return index != NOT_EXIST; +} + +uint32_t SparseTable::get(const int id) const { + const int indexTableReadingPos = getPosInIndexTable(id); + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, indexTableReadingPos); + const int contentTableReadingPos = getPosInContentTable(id, index); + if (contentTableReadingPos < 0 + || contentTableReadingPos >= mContentTableBuffer->getTailPosition()) { + AKLOGE("contentTableReadingPos(%d) is invalid. id: %d, index: %d", + contentTableReadingPos, id, index); + return NOT_A_DICT_POS; + } + const int contentValue = mContentTableBuffer->readUint(mDataSize, contentTableReadingPos); + return contentValue == NOT_EXIST ? NOT_A_DICT_POS : contentValue; +} + +bool SparseTable::set(const int id, const uint32_t value) { + const int posInIndexTable = getPosInIndexTable(id); + // Extends the index table if needed. + int tailPos = mIndexTableBuffer->getTailPosition(); + while (tailPos <= posInIndexTable) { + if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { + AKLOGE("cannot extend index table. 
tailPos: %d to: %d", tailPos, posInIndexTable); + return false; + } + } + if (contains(id)) { + // The entry is already in the content table. + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); + if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) { + AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value, + getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(), + mDataSize); + return false; + } + return true; + } + // The entry is not in the content table. + // Create new entry in the content table. + const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); + if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { + AKLOGE("cannot write index %d. pos %d", index, posInIndexTable); + return false; + } + // Write a new block that containing the entry to be set. + int writingPos = getPosInContentTable(0 /* id */, index); + for (int i = 0; i < mBlockSize; ++i) { + if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, mDataSize, + &writingPos)) { + AKLOGE("cannot write content table to extend. 
writingPos: %d, tailPos: %d, " + "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize); + return false; + } + } + return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); +} + +int SparseTable::getIndexFromContentTablePos(const int contentTablePos) const { + return contentTablePos / mDataSize / mBlockSize; +} + +int SparseTable::getPosInIndexTable(const int id) const { + return (id / mBlockSize) * INDEX_SIZE; +} + +int SparseTable::getPosInContentTable(const int id, const int index) const { + const int offset = id % mBlockSize; + return (index * mBlockSize + offset) * mDataSize; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/sparse_table.h b/app/src/main/jni/src/dictionary/utils/sparse_table.h new file mode 100644 index 000000000..bd1190e8b --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/sparse_table.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SPARSE_TABLE_H +#define LATINIME_SPARSE_TABLE_H + +#include + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// TODO: Support multiple content buffers. 
// Maps integer ids to unsigned values using two externally owned buffers: an index table
// with one INDEX_SIZE-byte slot per group of blockSize ids, and a content table holding
// blocks of blockSize entries of dataSize bytes each. Blocks are only created for id groups
// that have been written to.
class SparseTable {
 public:
    // Neither buffer is owned by the table; both pointers are stored as given.
    // blockSize is the number of ids per block, dataSize the byte width of one content entry.
    SparseTable(BufferWithExtendableBuffer *const indexTableBuffer,
            BufferWithExtendableBuffer *const contentTableBuffer, const int blockSize,
            const int dataSize)
            : mIndexTableBuffer(indexTableBuffer), mContentTableBuffer(contentTableBuffer),
              mBlockSize(blockSize), mDataSize(dataSize) {}

    // True when id is non-negative and the index table has a non-empty slot for id's block.
    bool contains(const int id) const;

    // Returns the value stored for id, or NOT_A_DICT_POS when the entry position is outside
    // the content table or the entry is empty.
    uint32_t get(const int id) const;

    // Stores value for id, growing the index and content tables as needed.
    // Returns false when a buffer write fails.
    bool set(const int id, const uint32_t value);

 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTable);

    // Block index for a byte position in the content table.
    int getIndexFromContentTablePos(const int contentTablePos) const;

    // Byte position of the index-table slot covering id.
    int getPosInIndexTable(const int id) const;

    // Byte position of id's entry inside content block `index`.
    int getPosInContentTable(const int id, const int index) const;

    // Marker value meaning "no entry" in either table.
    static const int NOT_EXIST;
    // Byte width of one index-table slot.
    static const int INDEX_SIZE;

    BufferWithExtendableBuffer *const mIndexTableBuffer;
    BufferWithExtendableBuffer *const mContentTableBuffer;
    const int mBlockSize;
    const int mDataSize;
};
+ */ + +#include "dictionary/utils/trie_map.h" + +#include "dictionary/utils/dict_file_writing_utils.h" + +namespace latinime { + +const int TrieMap::INVALID_INDEX = -1; +const int TrieMap::FIELD0_SIZE = 4; +const int TrieMap::FIELD1_SIZE = 3; +const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE; +const uint32_t TrieMap::VALUE_FLAG = 0x400000; +const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF; +const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK; +const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000; +const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF; +const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5; +const uint32_t TrieMap::LABEL_MASK = 0x1F; +const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_ONE_LEVEL; +const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0; +const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE; +const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0); +const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry. 
+const uint64_t TrieMap::MAX_VALUE = + (static_cast(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1; +const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE; + +TrieMap::TrieMap() : mBuffer(MAX_BUFFER_SIZE) { + mBuffer.extend(ROOT_BITMAP_ENTRY_POS); + writeEntry(EMPTY_BITMAP_ENTRY, ROOT_BITMAP_ENTRY_INDEX); +} + +TrieMap::TrieMap(const ReadWriteByteArrayView buffer) + : mBuffer(buffer, BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} + +void TrieMap::dump(const int from, const int to) const { + AKLOGI("BufSize: %d", mBuffer.getTailPosition()); + for (int i = from; i < to; ++i) { + AKLOGI("Entry[%d]: %x, %x", i, readField0(i), readField1(i)); + } + int unusedRegionSize = 0; + for (int i = 1; i <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; ++i) { + int index = readEmptyTableLink(i); + while (index != ROOT_BITMAP_ENTRY_INDEX) { + index = readField0(index); + unusedRegionSize += i; + } + } + AKLOGI("Unused Size: %d", unusedRegionSize); +} + +int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. + return INVALID_INDEX; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (terminalEntry.hasTerminalLink()) { + return terminalEntry.getValueEntryIndex() + 1; + } + // Create a value entry and a bitmap entry. 
+ const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return INVALID_INDEX; + } + if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) { + return INVALID_INDEX; + } + if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { + return INVALID_INDEX; + } + if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) { + return INVALID_INDEX; + } + return valueEntryIndex + 1; +} + +const TrieMap::Result TrieMap::get(const int key, const int bitmapEntryIndex) const { + const uint32_t unsignedKey = static_cast(key); + return getInternal(unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntryIndex, + 0 /* level */); +} + +bool TrieMap::put(const int key, const uint64_t value, const int bitmapEntryIndex) { + if (value > MAX_VALUE) { + return false; + } + const uint32_t unsignedKey = static_cast(key); + return putInternal(unsignedKey, value, getBitShuffledKey(unsignedKey), bitmapEntryIndex, + readEntry(bitmapEntryIndex), 0 /* level */); +} + +bool TrieMap::save(FILE *const file) const { + return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer); +} + +bool TrieMap::remove(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. 
+ return false; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) { + return false; + } + if (terminalEntry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1); + if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)){ + return false; + } + } + return true; +} + +/** + * Iterate next entry in a certain level. + * + * @param iterationState the iteration state that will be read and updated in this method. + * @param outKey the output key + * @return Result instance. mIsValid is false when all entries are iterated. + */ +const TrieMap::Result TrieMap::iterateNext(std::vector *const iterationState, + int *const outKey) const { + while (!iterationState->empty()) { + TableIterationState &state = iterationState->back(); + if (state.mTableSize <= state.mCurrentIndex) { + // Move to parent. + iterationState->pop_back(); + } else { + const int entryIndex = state.mTableIndex + state.mCurrentIndex; + state.mCurrentIndex += 1; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Move to child. + iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex()); + } else if (entry.isValidTerminalEntry()) { + if (outKey) { + *outKey = entry.getKey(); + } + if (!entry.hasTerminalLink()) { + return Result(entry.getValue(), true, INVALID_INDEX); + } + const int valueEntryIndex = entry.getValueEntryIndex(); + const Entry valueEntry = readEntry(valueEntryIndex); + return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); + } + } + } + // Visited all entries. + return Result(0, false, INVALID_INDEX); +} + +/** + * Shuffle bits of the key in the fixed order. + * + * This method is used as a hash function. This returns different values for different inputs. 
+ */ +uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const { + uint32_t shuffledKey = 0; + for (int i = 0; i < 4; ++i) { + const uint32_t keyPiece = (key >> (i * 8)) & 0xFF; + shuffledKey ^= ((keyPiece ^ (keyPiece << 7) ^ (keyPiece << 14) ^ (keyPiece << 21)) + & 0x11111111) << i; + } + return shuffledKey; +} + +bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) { + if (value < VALUE_MASK) { + // Write value into the terminal entry. + return writeField1(value | VALUE_FLAG, terminalEntryIndex); + } + // Create value entry and write value. + const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return false; + } + if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) { + return false; + } + if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { + return false; + } + return writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex); +} + +bool TrieMap::updateValue(const Entry &terminalEntry, const uint64_t value, + const int terminalEntryIndex) { + if (!terminalEntry.hasTerminalLink()) { + return writeValue(value, terminalEntryIndex); + } + const int valueEntryIndex = terminalEntry.getValueEntryIndex(); + return writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex); +} + +bool TrieMap::freeTable(const int tableIndex, const int entryCount) { + if (!writeField0(readEmptyTableLink(entryCount), tableIndex)) { + return false; + } + return writeEmptyTableLink(tableIndex, entryCount); +} + +/** + * Allocate table with entryCount-entries. Reuse freed table if possible. + */ +int TrieMap::allocateTable(const int entryCount) { + if (entryCount > 0 && entryCount <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL) { + const int tableIndex = readEmptyTableLink(entryCount); + if (tableIndex > 0) { + if (!writeEmptyTableLink(readField0(tableIndex), entryCount)) { + return INVALID_INDEX; + } + // Reuse the table. 
+ return tableIndex; + } + } + // Allocate memory space at tail position of the buffer. + const int mapIndex = getTailEntryIndex(); + if (!mBuffer.extend(entryCount * ENTRY_SIZE)) { + return INVALID_INDEX; + } + return mapIndex; +} + +int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, + const Entry &bitmapEntry, const int level) const { + const int label = getLabel(hashedKey, level); + if (!exists(bitmapEntry.getBitmap(), label)) { + return INVALID_INDEX; + } + const int entryIndex = bitmapEntry.getTableIndex() + popCount(bitmapEntry.getBitmap(), label); + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Move to the next level. + return getTerminalEntryIndex(key, hashedKey, entry, level + 1); + } + if (!entry.isValidTerminalEntry()) { + return INVALID_INDEX; + } + if (entry.getKey() == key) { + // Terminal entry is found. + return entryIndex; + } + return INVALID_INDEX; +} + +/** + * Get Result corresponding to the key. + * + * @param key the key. + * @param hashedKey the hashed key. + * @param bitmapEntryIndex the index of bitmap entry + * @param level current level + * @return Result instance corresponding to the key. mIsValid indicates whether the key is in the + * map. + */ +const TrieMap::Result TrieMap::getInternal(const uint32_t key, const uint32_t hashedKey, + const int bitmapEntryIndex, const int level) const { + const int terminalEntryIndex = getTerminalEntryIndex(key, hashedKey, + readEntry(bitmapEntryIndex), level); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. 
+ return Result(0, false, INVALID_INDEX); + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!terminalEntry.hasTerminalLink()) { + return Result(terminalEntry.getValue(), true, INVALID_INDEX); + } + const int valueEntryIndex = terminalEntry.getValueEntryIndex(); + const Entry valueEntry = readEntry(valueEntryIndex); + return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); +} + +/** + * Put key to value mapping to the map. + * + * @param key the key. + * @param value the value + * @param hashedKey the hashed key. + * @param bitmapEntryIndex the index of bitmap entry + * @param bitmapEntry the bitmap entry + * @param level current level + * @return whether the key-value has been correctly inserted to the map or not. + */ +bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, + const int bitmapEntryIndex, const Entry &bitmapEntry, const int level) { + const int label = getLabel(hashedKey, level); + const uint32_t bitmap = bitmapEntry.getBitmap(); + const int mapIndex = bitmapEntry.getTableIndex(); + if (!exists(bitmap, label)) { + // Current map doesn't contain the label. + return addNewEntryByExpandingTable(key, value, mapIndex, bitmap, bitmapEntryIndex, label); + } + const int entryIndex = mapIndex + popCount(bitmap, label); + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Bitmap entry is found. Go to the next level. + return putInternal(key, value, hashedKey, entryIndex, entry, level + 1); + } + if (!entry.isValidTerminalEntry()) { + // Overwrite invalid terminal entry. + return writeTerminalEntry(key, value, entryIndex); + } + if (entry.getKey() == key) { + // Terminal entry for the key is found. Update the value. + return updateValue(entry, value, entryIndex); + } + // Conflict with the existing key. 
+ return addNewEntryByResolvingConflict(key, value, hashedKey, entry, entryIndex, level); +} + +/** + * Resolve a conflict in the current level and add new entry. + * + * @param key the key + * @param value the value + * @param hashedKey the hashed key + * @param conflictedEntry the existing conflicted entry + * @param conflictedEntryIndex the index of existing conflicted entry + * @param level current level + * @return whether the key-value has been correctly inserted to the map or not. + */ +bool TrieMap::addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, + const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, + const int level) { + const int conflictedKeyNextLabel = + getLabel(getBitShuffledKey(conflictedEntry.getKey()), level + 1); + const int nextLabel = getLabel(hashedKey, level + 1); + if (conflictedKeyNextLabel == nextLabel) { + // Conflicted again in the next level. + const int newTableIndex = allocateTable(1 /* entryCount */); + if (newTableIndex == INVALID_INDEX) { + return false; + } + if (!writeEntry(conflictedEntry, newTableIndex)) { + return false; + } + const Entry newBitmapEntry(setExist(0 /* bitmap */, nextLabel), newTableIndex); + if (!writeEntry(newBitmapEntry, conflictedEntryIndex)) { + return false; + } + return putInternal(key, value, hashedKey, conflictedEntryIndex, newBitmapEntry, level + 1); + } + // The conflict has been resolved. Create a table that contains 2 entries. 
+ const int newTableIndex = allocateTable(2 /* entryCount */); + if (newTableIndex == INVALID_INDEX) { + return false; + } + if (nextLabel < conflictedKeyNextLabel) { + if (!writeTerminalEntry(key, value, newTableIndex)) { + return false; + } + if (!writeEntry(conflictedEntry, newTableIndex + 1)) { + return false; + } + } else { // nextLabel > conflictedKeyNextLabel + if (!writeEntry(conflictedEntry, newTableIndex)) { + return false; + } + if (!writeTerminalEntry(key, value, newTableIndex + 1)) { + return false; + } + } + const uint32_t updatedBitmap = + setExist(setExist(0 /* bitmap */, nextLabel), conflictedKeyNextLabel); + return writeEntry(Entry(updatedBitmap, newTableIndex), conflictedEntryIndex); +} + +/** + * Add new entry to the existing table. + */ +bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, + const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, const int label) { + // Current map doesn't contain the label. + const int entryCount = popCount(bitmap); + const int newTableIndex = allocateTable(entryCount + 1); + if (newTableIndex == INVALID_INDEX) { + return false; + } + const int newEntryIndexInTable = popCount(bitmap, label); + // Copy from existing table to the new table. + for (int i = 0; i < entryCount; ++i) { + if (!copyEntry(tableIndex + i, newTableIndex + i + (i >= newEntryIndexInTable ? 1 : 0))) { + return false; + } + } + // Write new terminal entry. + if (!writeTerminalEntry(key, value, newTableIndex + newEntryIndexInTable)) { + return false; + } + // Update bitmap. + if (!writeEntry(Entry(setExist(bitmap, label), newTableIndex), bitmapEntryIndex)) { + return false; + } + if (entryCount > 0) { + return freeTable(tableIndex, entryCount); + } + return true; +} + +bool TrieMap::removeInner(const Entry &bitmapEntry) { + const int tableSize = popCount(bitmapEntry.getBitmap()); + if (tableSize <= 0) { + // The table is empty. No need to remove any entries. 
+ return true; + } + for (int i = 0; i < tableSize; ++i) { + const int entryIndex = bitmapEntry.getTableIndex() + i; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Delete next bitmap entry recursively. + if (!removeInner(entry)) { + return false; + } + } else { + // Invalidate terminal entry just in case. + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) { + return false; + } + if (entry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1); + if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)) { + return false; + } + } + } + } + return true; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/dictionary/utils/trie_map.h b/app/src/main/jni/src/dictionary/utils/trie_map.h new file mode 100644 index 000000000..5fc6c2690 --- /dev/null +++ b/app/src/main/jni/src/dictionary/utils/trie_map.h @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TRIE_MAP_H +#define LATINIME_TRIE_MAP_H + +#include +#include +#include +#include + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/** + * Trie map derived from Phil Bagwell's Hash Array Mapped Trie. 
+ * key is int and value is uint64_t. + * This supports multiple level map. Terminal entries can have a bitmap for the next level map. + * This doesn't support root map resizing. + */ +class TrieMap { + public: + struct Result { + const uint64_t mValue; + const bool mIsValid; + const int mNextLevelBitmapEntryIndex; + + Result(const uint64_t value, const bool isValid, const int nextLevelBitmapEntryIndex) + : mValue(value), mIsValid(isValid), + mNextLevelBitmapEntryIndex(nextLevelBitmapEntryIndex) {} + }; + + /** + * Struct to record iteration state in a table. + */ + struct TableIterationState { + int mTableSize; + int mTableIndex; + int mCurrentIndex; + + TableIterationState(const int tableSize, const int tableIndex) + : mTableSize(tableSize), mTableIndex(tableIndex), mCurrentIndex(0) {} + }; + + class TrieMapRange; + class TrieMapIterator { + public: + class IterationResult { + public: + IterationResult(const TrieMap *const trieMap, const int key, const uint64_t value, + const int nextLeveBitmapEntryIndex) + : mTrieMap(trieMap), mKey(key), mValue(value), + mNextLevelBitmapEntryIndex(nextLeveBitmapEntryIndex) {} + + const TrieMapRange getEntriesInNextLevel() const { + return TrieMapRange(mTrieMap, mNextLevelBitmapEntryIndex); + } + + bool hasNextLevelMap() const { + return mNextLevelBitmapEntryIndex != INVALID_INDEX; + } + + AK_FORCE_INLINE int key() const { + return mKey; + } + + AK_FORCE_INLINE uint64_t value() const { + return mValue; + } + + AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const { + return mNextLevelBitmapEntryIndex; + } + + private: + const TrieMap *const mTrieMap; + const int mKey; + const uint64_t mValue; + const int mNextLevelBitmapEntryIndex; + }; + + TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex) + : mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex), + mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) { + if (!trieMap || mBaseBitmapEntryIndex == 
INVALID_INDEX) { + return; + } + const Entry bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex); + mStateStack.emplace_back( + mTrieMap->popCount(bitmapEntry.getBitmap()), bitmapEntry.getTableIndex()); + this->operator++(); + } + + const IterationResult operator*() const { + return IterationResult(mTrieMap, mKey, mValue, mNextLevelBitmapEntryIndex); + } + + bool operator!=(const TrieMapIterator &other) const { + // Caveat: This works only for for loops. + return mIsValid || other.mIsValid; + } + + const TrieMapIterator &operator++() { + const Result result = mTrieMap->iterateNext(&mStateStack, &mKey); + mValue = result.mValue; + mIsValid = result.mIsValid; + mNextLevelBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + return *this; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapIterator); + DISALLOW_ASSIGNMENT_OPERATOR(TrieMapIterator); + + const TrieMap *const mTrieMap; + std::vector mStateStack; + const int mBaseBitmapEntryIndex; + int mKey; + uint64_t mValue; + bool mIsValid; + int mNextLevelBitmapEntryIndex; + }; + + /** + * Class to support iterating entries in TrieMap by range base for loops. + */ + class TrieMapRange { + public: + TrieMapRange(const TrieMap *const trieMap, const int bitmapEntryIndex) + : mTrieMap(trieMap), mBaseBitmapEntryIndex(bitmapEntryIndex) {}; + + TrieMapIterator begin() const { + return TrieMapIterator(mTrieMap, mBaseBitmapEntryIndex); + } + + const TrieMapIterator end() const { + return TrieMapIterator(nullptr, INVALID_INDEX); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapRange); + DISALLOW_ASSIGNMENT_OPERATOR(TrieMapRange); + + const TrieMap *const mTrieMap; + const int mBaseBitmapEntryIndex; + }; + + static const int INVALID_INDEX; + static const uint64_t MAX_VALUE; + + TrieMap(); + // Construct TrieMap using existing data in the memory region written by save(). 
+ TrieMap(const ReadWriteByteArrayView buffer); + void dump(const int from = 0, const int to = 0) const; + + bool isNearSizeLimit() const { + return mBuffer.isNearSizeLimit(); + } + + int getRootBitmapEntryIndex() const { + return ROOT_BITMAP_ENTRY_INDEX; + } + + // Returns bitmapEntryIndex. Create the next level map if it doesn't exist. + int getNextLevelBitmapEntryIndex(const int key) { + return getNextLevelBitmapEntryIndex(key, ROOT_BITMAP_ENTRY_INDEX); + } + + int getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex); + + const Result getRoot(const int key) const { + return get(key, ROOT_BITMAP_ENTRY_INDEX); + } + + const Result get(const int key, const int bitmapEntryIndex) const; + + bool putRoot(const int key, const uint64_t value) { + return put(key, value, ROOT_BITMAP_ENTRY_INDEX); + } + + bool put(const int key, const uint64_t value, const int bitmapEntryIndex); + + const TrieMapRange getEntriesInRootLevel() const { + return getEntriesInSpecifiedLevel(ROOT_BITMAP_ENTRY_INDEX); + } + + const TrieMapRange getEntriesInSpecifiedLevel(const int bitmapEntryIndex) const { + return TrieMapRange(this, bitmapEntryIndex); + } + + bool save(FILE *const file) const; + + bool remove(const int key, const int bitmapEntryIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(TrieMap); + + /** + * Struct represents an entry. + * + * Entry is one of these entry types. All entries are fixed size and have 2 fields FIELD_0 and + * FIELD_1. + * 1. bitmap entry. bitmap entry contains bitmap and the link to hash table. + * FIELD_0(bitmap) FIELD_1(LINK_TO_HASH_TABLE) + * 2. terminal entry. terminal entry contains hashed key and value or terminal link. terminal + * entry have terminal link when the value is not fit to FIELD_1 or there is a next level map + * for the key. + * FIELD_0(hashed key) (FIELD_1(VALUE_FLAG VALUE) | FIELD_1(TERMINAL_LINK_FLAG TERMINAL_LINK)) + * 3. value entry. value entry represents a value. 
Upper order bytes are stored in FIELD_0 and + * lower order bytes are stored in FIELD_1. + * FIELD_0(value (upper order bytes)) FIELD_1(value (lower order bytes)) + */ + struct Entry { + const uint32_t mData0; + const uint32_t mData1; + + Entry(const uint32_t data0, const uint32_t data1) : mData0(data0), mData1(data1) {} + + AK_FORCE_INLINE bool isBitmapEntry() const { + return (mData1 & VALUE_FLAG) == 0 && (mData1 & TERMINAL_LINK_FLAG) == 0; + } + + AK_FORCE_INLINE bool hasTerminalLink() const { + return (mData1 & TERMINAL_LINK_FLAG) != 0; + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getKey() const { + return mData0; + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getValue() const { + return mData1 & VALUE_MASK; + } + + // For terminal entry. + AK_FORCE_INLINE bool isValidTerminalEntry() const { + return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY); + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getValueEntryIndex() const { + return mData1 & TERMINAL_LINK_MASK; + } + + // For bitmap entry. + AK_FORCE_INLINE uint32_t getBitmap() const { + return mData0; + } + + // For bitmap entry. + AK_FORCE_INLINE int getTableIndex() const { + return static_cast(mData1); + } + + // For value entry. 
+ AK_FORCE_INLINE uint64_t getValueOfValueEntry() const { + return ((static_cast(mData0) << (FIELD1_SIZE * CHAR_BIT)) ^ mData1); + } + }; + + BufferWithExtendableBuffer mBuffer; + + static const int FIELD0_SIZE; + static const int FIELD1_SIZE; + static const int ENTRY_SIZE; + static const uint32_t VALUE_FLAG; + static const uint32_t VALUE_MASK; + static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY; + static const uint32_t TERMINAL_LINK_FLAG; + static const uint32_t TERMINAL_LINK_MASK; + static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL; + static const uint32_t LABEL_MASK; + static const int MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; + static const int ROOT_BITMAP_ENTRY_INDEX; + static const int ROOT_BITMAP_ENTRY_POS; + static const Entry EMPTY_BITMAP_ENTRY; + static const int TERMINAL_LINKED_ENTRY_COUNT; + static const int MAX_BUFFER_SIZE; + + uint32_t getBitShuffledKey(const uint32_t key) const; + bool writeValue(const uint64_t value, const int terminalEntryIndex); + bool updateValue(const Entry &terminalEntry, const uint64_t value, + const int terminalEntryIndex); + bool freeTable(const int tableIndex, const int entryCount); + int allocateTable(const int entryCount); + int getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, + const Entry &bitmapEntry, const int level) const; + const Result getInternal(const uint32_t key, const uint32_t hashedKey, + const int bitmapEntryIndex, const int level) const; + bool putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, + const int bitmapEntryIndex, const Entry &bitmapEntry, const int level); + bool addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, + const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, + const int level); + bool addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, + const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, + const int label); + const Result iterateNext(std::vector 
*const iterationState, + int *const outKey) const; + + AK_FORCE_INLINE const Entry readEntry(const int entryIndex) const { + return Entry(readField0(entryIndex), readField1(entryIndex)); + } + + // Returns whether an entry for the index is existing by testing if the index-th bit in the + // bitmap is set or not. + AK_FORCE_INLINE bool exists(const uint32_t bitmap, const int index) const { + return (bitmap & (1 << index)) != 0; + } + + // Set index-th bit in the bitmap. + AK_FORCE_INLINE uint32_t setExist(const uint32_t bitmap, const int index) const { + return bitmap | (1 << index); + } + + // Count set bits before index in the bitmap. + AK_FORCE_INLINE int popCount(const uint32_t bitmap, const int index) const { + return popCount(bitmap & ((1 << index) - 1)); + } + + // Count set bits in the bitmap. + AK_FORCE_INLINE int popCount(const uint32_t bitmap) const { + return __builtin_popcount(bitmap); + // int v = bitmap - ((bitmap >> 1) & 0x55555555); + // v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + // return (((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; + } + + AK_FORCE_INLINE int getLabel(const uint32_t hashedKey, const int level) const { + return (hashedKey >> (level * NUM_OF_BITS_USED_FOR_ONE_LEVEL)) & LABEL_MASK; + } + + AK_FORCE_INLINE uint32_t readField0(const int entryIndex) const { + return mBuffer.readUint(FIELD0_SIZE, ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); + } + + AK_FORCE_INLINE uint32_t readField1(const int entryIndex) const { + return mBuffer.readUint(FIELD1_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); + } + + AK_FORCE_INLINE int readEmptyTableLink(const int entryCount) const { + return mBuffer.readUint(FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); + } + + AK_FORCE_INLINE bool writeEmptyTableLink(const int tableIndex, const int entryCount) { + return mBuffer.writeUint(tableIndex, FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); + } + + AK_FORCE_INLINE bool writeField0(const uint32_t data, const int 
entryIndex) { + return mBuffer.writeUint(data, FIELD0_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); + } + + AK_FORCE_INLINE bool writeField1(const uint32_t data, const int entryIndex) { + return mBuffer.writeUint(data, FIELD1_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); + } + + AK_FORCE_INLINE bool writeEntry(const Entry &entry, const int entryIndex) { + return writeField0(entry.mData0, entryIndex) && writeField1(entry.mData1, entryIndex); + } + + AK_FORCE_INLINE bool writeTerminalEntry(const uint32_t key, const uint64_t value, + const int entryIndex) { + return writeField0(key, entryIndex) && writeValue(value, entryIndex); + } + + AK_FORCE_INLINE bool copyEntry(const int originalEntryIndex, const int newEntryIndex) { + return writeEntry(readEntry(originalEntryIndex), newEntryIndex); + } + + AK_FORCE_INLINE int getTailEntryIndex() const { + return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE; + } + + bool removeInner(const Entry &bitmapEntry); +}; + +} // namespace latinime +#endif /* LATINIME_TRIE_MAP_H */ diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node.cpp b/app/src/main/jni/src/suggest/core/dicnode/dic_node.cpp new file mode 100644 index 000000000..414dc3b1e --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "suggest/core/dicnode/dic_node.h"
+
+namespace latinime {
+
+// Copy constructor: copies the properties, the state and the cache flag of dicNode. The profiler
+// member exists only in DEBUG_DICT builds and is copied there as well.
+DicNode::DicNode(const DicNode &dicNode)
+        :
+#if DEBUG_DICT
+          mProfiler(dicNode.mProfiler),
+#endif
+          mDicNodeProperties(dicNode.mDicNodeProperties), mDicNodeState(dicNode.mDicNodeState),
+          mIsCachedForNextSuggestion(dicNode.mIsCachedForNextSuggestion) {
+    /* empty */
+}
+
+// Copy assignment: assigns the same members as the copy constructor and returns *this.
+DicNode &DicNode::operator=(const DicNode &dicNode) {
+#if DEBUG_DICT
+    mProfiler = dicNode.mProfiler;
+#endif
+    mDicNodeProperties = dicNode.mDicNodeProperties;
+    mDicNodeState = dicNode.mDicNodeState;
+    mIsCachedForNextSuggestion = dicNode.mIsCachedForNextSuggestion;
+    return *this;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node.h b/app/src/main/jni/src/suggest/core/dicnode/dic_node.h
new file mode 100644
index 000000000..5214077dc
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node.h
@@ -0,0 +1,505 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_H
+#define LATINIME_DIC_NODE_H
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node_profiler.h"
+#include "suggest/core/dicnode/dic_node_utils.h"
+#include "suggest/core/dicnode/internal/dic_node_state.h"
+#include "suggest/core/dicnode/internal/dic_node_properties.h"
+#include "suggest/core/dictionary/digraph_utils.h"
+#include "suggest/core/dictionary/error_type_utils.h"
+#include "suggest/core/layout/proximity_info_state.h"
+#include "utils/char_utils.h"
+#include "utils/int_array_view.h"
+
+// Debug logging helpers. Both macros expand to nothing unless DEBUG_DICT is set.
+// LOGI_SHOW_ADD_COST_PROP logs the current node's code point, input indices, distance and output
+// word; it reads a variable named inputSize from the scope it is expanded in.
+// DUMP_WORD_AND_SCORE(header) logs the output word (including previous words) together with the
+// node's distances, prefixed by header.
+#if DEBUG_DICT
+#define LOGI_SHOW_ADD_COST_PROP \
+        do { \
+            char charBuf[50]; \
+            INTS_TO_CHARS(getOutputWordBuf(), getNodeCodePointCount(), charBuf, NELEMS(charBuf)); \
+            AKLOGI("%20s, \"%c\", size = %03d, total = %03d, index(0) = %02d, dist = %.4f, %s,,", \
+                    __FUNCTION__, getNodeCodePoint(), inputSize, getTotalInputIndex(), \
+                    getInputIndex(0), getNormalizedCompoundDistance(), charBuf); \
+        } while (0)
+#define DUMP_WORD_AND_SCORE(header) \
+        do { \
+            char charBuf[50]; \
+            INTS_TO_CHARS(getOutputWordBuf(), \
+                    getNodeCodePointCount() \
+                            + mDicNodeState.mDicNodeStateOutput.getPrevWordsLength(), \
+                    charBuf, NELEMS(charBuf)); \
+            AKLOGI("#%8s, %5f, %5f, %5f, %5f, %s, %d, %5f,", header, \
+                    getSpatialDistanceForScoring(), \
+                    mDicNodeState.mDicNodeStateScoring.getLanguageDistance(), \
+                    getNormalizedCompoundDistance(), getRawLength(), charBuf, \
+                    getInputIndex(0), getNormalizedCompoundDistanceAfterFirstWord()); \
+        } while (0)
+#else
+#define LOGI_SHOW_ADD_COST_PROP
+#define DUMP_WORD_AND_SCORE(header)
+#endif
+
+namespace latinime {
+
+// This struct is purely a bucket to return values. No instances of this struct should be kept.
+struct DicNode_InputStateG {
+    DicNode_InputStateG()
+            : mNeedsToUpdateInputStateG(false), mPointerId(0), mInputIndex(0),
+              mPrevCodePoint(0), mTerminalDiffCost(0.0f), mRawLength(0.0f),
+              mDoubleLetterLevel(NOT_A_DOUBLE_LETTER) {}
+
+    bool mNeedsToUpdateInputStateG;
+    int mPointerId;
+    int16_t mInputIndex;
+    int mPrevCodePoint;
+    float mTerminalDiffCost;
+    float mRawLength;
+    DoubleLetterLevel mDoubleLetterLevel;
+};
+
+// A node visited while traversing the dictionary for suggestions. It bundles the position and
+// identity of the node (mDicNodeProperties) with the input, output and scoring state
+// accumulated so far (mDicNodeState).
+class DicNode {
+    // Caveat: We define Weighting as a friend class of DicNode to let Weighting change
+    // the distance of DicNode.
+    // Caution!!! In general, we avoid using the "friend" access modifier.
+    // This is an exception to explicitly hide DicNode::addCost() from all classes but Weighting.
+    friend class Weighting;
+
+ public:
+#if DEBUG_DICT
+    DicNodeProfiler mProfiler;
+#endif
+
+    AK_FORCE_INLINE DicNode()
+            :
+#if DEBUG_DICT
+              mProfiler(),
+#endif
+              mDicNodeProperties(), mDicNodeState(), mIsCachedForNextSuggestion(false) {}
+
+    DicNode(const DicNode &dicNode);
+    DicNode &operator=(const DicNode &dicNode);
+    ~DicNode() {}
+
+    // Init for copy
+    void initByCopy(const DicNode *const dicNode) {
+        mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
+        mDicNodeProperties.initByCopy(&dicNode->mDicNodeProperties);
+        mDicNodeState.initByCopy(&dicNode->mDicNodeState);
+        PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
+    }
+
+    // Init for root with prevWordIds which is used for n-gram
+    void initAsRoot(const int rootPtNodeArrayPos, const WordIdArrayView prevWordIds) {
+        mIsCachedForNextSuggestion = false;
+        mDicNodeProperties.init(rootPtNodeArrayPos, prevWordIds);
+        mDicNodeState.init();
+        PROF_NODE_RESET(mProfiler);
+    }
+
+    // Init for root with previous word. The word id of dicNode becomes the most recent previous
+    // word, followed by as many of dicNode's own previous word ids as fit in the array.
+    void initAsRootWithPreviousWord(const DicNode *const dicNode, const int rootPtNodeArrayPos) {
+        mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
+        // NOTE(review): if WordIdArray is a class template in utils/int_array_view.h, its size
+        // argument appears to have been lost from this declaration -- confirm against that header.
+        WordIdArray newPrevWordIds;
+        newPrevWordIds[0] = dicNode->mDicNodeProperties.getWordId();
+        dicNode->getPrevWordIds().limit(newPrevWordIds.size() - 1)
+                .copyToArray(&newPrevWordIds, 1 /* offset */);
+        mDicNodeProperties.init(rootPtNodeArrayPos, WordIdArrayView::fromArray(newPrevWordIds));
+        mDicNodeState.initAsRootWithPreviousWord(&dicNode->mDicNodeState,
+                dicNode->mDicNodeProperties.getDepth());
+        PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
+    }
+
+    // Init as the next node of a parent that has not reached its leaving depth yet: the code
+    // point is read from the parent's already merged output and the state is copied.
+    void initAsPassingChild(const DicNode *parentDicNode) {
+        mIsCachedForNextSuggestion = parentDicNode->mIsCachedForNextSuggestion;
+        const int codePoint =
+                parentDicNode->mDicNodeState.mDicNodeStateOutput.getCurrentWordCodePointAt(
+                            parentDicNode->getNodeCodePointCount());
+        mDicNodeProperties.init(&parentDicNode->mDicNodeProperties, codePoint);
+        mDicNodeState.initByCopy(&parentDicNode->mDicNodeState);
+        PROF_NODE_COPY(&parentDicNode->mProfiler, mProfiler);
+    }
+
+    // Init as a child of dicNode that is one code point deeper and carries mergedCodePoints.
+    void initAsChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos,
+            const int wordId, const CodePointArrayView mergedCodePoints) {
+        uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
+        mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
+        const uint16_t newLeavingDepth = static_cast<uint16_t>(
+                dicNode->mDicNodeProperties.getLeavingDepth() + mergedCodePoints.size());
+        mDicNodeProperties.init(childrenPtNodeArrayPos, mergedCodePoints[0],
+                wordId, newDepth, newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordIds());
+        mDicNodeState.init(&dicNode->mDicNodeState, mergedCodePoints.size(),
+                mergedCodePoints.data());
+        PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
+    }
+
+    bool isRoot() const {
+        return getNodeCodePointCount() == 0;
+    }
+
+    bool hasChildren() const {
+        return mDicNodeProperties.hasChildren();
+    }
+
+    // True when the depth of this node has reached the leaving depth of its properties.
+    bool isLeavingNode() const {
+        ASSERT(getNodeCodePointCount() <= mDicNodeProperties.getLeavingDepth());
+        return getNodeCodePointCount() == mDicNodeProperties.getLeavingDepth();
+    }
+
+    AK_FORCE_INLINE bool isFirstLetter() const {
+        return getNodeCodePointCount() == 1;
+    }
+
+    bool isCached() const {
+        return mIsCachedForNextSuggestion;
+    }
+
+    void setCached() {
+        mIsCachedForNextSuggestion = true;
+    }
+
+    // Check if the current word and the previous word can be considered as a valid multiple word
+    // suggestion.
+    bool isValidMultipleWordSuggestion() const {
+        // Treat suggestion as invalid if the current and the previous word are single character
+        // words.
+        const int prevWordLen = mDicNodeState.mDicNodeStateOutput.getPrevWordsLength()
+                - mDicNodeState.mDicNodeStateOutput.getPrevWordStart() - 1;
+        const int currentWordLen = getNodeCodePointCount();
+        return (prevWordLen != 1 || currentWordLen != 1);
+    }
+
+    bool isFirstCharUppercase() const {
+        const int c = mDicNodeState.mDicNodeStateOutput.getCurrentWordCodePointAt(0);
+        return CharUtils::isAsciiUpper(c);
+    }
+
+    bool isCompletion(const int inputSize) const {
+        return mDicNodeState.mDicNodeStateInput.getInputIndex(0) >= inputSize;
+    }
+
+    bool canDoLookAheadCorrection(const int inputSize) const {
+        return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1;
+    }
+
+    // Used to get n-gram probability in DicNodeUtils.
+    int getWordId() const {
+        return mDicNodeProperties.getWordId();
+    }
+
+    const WordIdArrayView getPrevWordIds() const {
+        return mDicNodeProperties.getPrevWordIds();
+    }
+
+    // Used in DicNodeUtils
+    int getChildrenPtNodeArrayPos() const {
+        return mDicNodeProperties.getChildrenPtNodeArrayPos();
+    }
+
+    // True when the properties are marked terminal and this non-root node sits exactly at the
+    // leaving depth.
+    AK_FORCE_INLINE bool isTerminalDicNode() const {
+        const bool isTerminalPtNode = mDicNodeProperties.isTerminal();
+        const int currentDicNodeDepth = getNodeCodePointCount();
+        const int terminalDicNodeDepth = mDicNodeProperties.getLeavingDepth();
+        return isTerminalPtNode && currentDicNodeDepth > 0
+                && currentDicNodeDepth == terminalDicNodeDepth;
+    }
+
+    // Filtered when this node is a root, or when both the current and the previous word are
+    // one code point long.
+    bool shouldBeFilteredBySafetyNetForBigram() const {
+        const uint16_t currentDepth = getNodeCodePointCount();
+        const int prevWordLen = mDicNodeState.mDicNodeStateOutput.getPrevWordsLength()
+                - mDicNodeState.mDicNodeStateOutput.getPrevWordStart() - 1;
+        return !(currentDepth > 0 && (currentDepth != 1 || prevWordLen != 1));
+    }
+
+    bool hasMatchedOrProximityCodePoints() const {
+        // This DicNode does not have matched or proximity code points when all code points have
+        // been handled as edit corrections or completion so far.
+        const int editCorrectionCount = mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount();
+        const int completionCount = mDicNodeState.mDicNodeStateScoring.getCompletionCount();
+        return (editCorrectionCount + completionCount) < getNodeCodePointCount();
+    }
+
+    bool isTotalInputSizeExceedingLimit() const {
+        // TODO: 3 can be 2? Needs to be investigated.
+        // TODO: Have a const variable for 3 (or 2)
+        return getTotalNodeCodePointCount() > MAX_WORD_LENGTH - 3;
+    }
+
+    // Copies getTotalNodeCodePointCount() code points of the output word into dest. The caller
+    // must provide a buffer that is large enough.
+    void outputResult(int *dest) const {
+        memmove(dest, getOutputWordBuf(), getTotalNodeCodePointCount() * sizeof(dest[0]));
+        DUMP_WORD_AND_SCORE("OUTPUT");
+    }
+
+    // "Total" in this context (and other methods in this class) means the whole suggestion. When
+    // this represents a multi-word suggestion, the referenced PtNode (in mDicNodeState) is only
+    // the one that corresponds to the last word of the suggestion, and all the previous words
+    // are concatenated together in mDicNodeStateOutput.
+    int getTotalNodeSpaceCount() const {
+        if (!hasMultipleWords()) {
+            return 0;
+        }
+        return CharUtils::getSpaceCount(mDicNodeState.mDicNodeStateOutput.getCodePointBuf(),
+                mDicNodeState.mDicNodeStateOutput.getPrevWordsLength());
+    }
+
+    int getSecondWordFirstInputIndex(const ProximityInfoState *const pInfoState) const {
+        const int inputIndex = mDicNodeState.mDicNodeStateOutput.getSecondWordFirstInputIndex();
+        if (inputIndex == NOT_AN_INDEX) {
+            return NOT_AN_INDEX;
+        } else {
+            return pInfoState->getInputIndexOfSampledPoint(inputIndex);
+        }
+    }
+
+    bool hasMultipleWords() const {
+        return mDicNodeState.mDicNodeStateOutput.getPrevWordCount() > 0;
+    }
+
+    int getProximityCorrectionCount() const {
+        return mDicNodeState.mDicNodeStateScoring.getProximityCorrectionCount();
+    }
+
+    int getEditCorrectionCount() const {
+        return mDicNodeState.mDicNodeStateScoring.getEditCorrectionCount();
+    }
+
+    // Used to prune nodes
+    float getNormalizedCompoundDistance() const {
+        return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistance();
+    }
+
+    // Used to prune nodes
+    float getNormalizedSpatialDistance() const {
+        return mDicNodeState.mDicNodeStateScoring.getSpatialDistance()
+                / static_cast<float>(getInputIndex(0) + 1);
+    }
+
+    // Used to prune nodes
+    float getCompoundDistance() const {
+        return mDicNodeState.mDicNodeStateScoring.getCompoundDistance();
+    }
+
+    // Used to prune nodes
+    float getCompoundDistance(const float weightOfLangModelVsSpatialModel) const {
+        return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(
+                weightOfLangModelVsSpatialModel);
+    }
+
+    AK_FORCE_INLINE const int *getOutputWordBuf() const {
+        return mDicNodeState.mDicNodeStateOutput.getCodePointBuf();
+    }
+
+    int getPrevCodePointG(int pointerId) const {
+        return mDicNodeState.mDicNodeStateInput.getPrevCodePoint(pointerId);
+    }
+
+    // Whether the current codepoint can be an intentional omission, in which case the traversal
+    // algorithm will always check for a possible omission here.
+    bool canBeIntentionalOmission() const {
+        return CharUtils::isIntentionalOmissionCodePoint(getNodeCodePoint());
+    }
+
+    // Whether the omission is so frequent that it should incur zero cost.
+    bool isZeroCostOmission() const {
+        // TODO: do not hardcode and read from header
+        return (getNodeCodePoint() == KEYCODE_SINGLE_QUOTE);
+    }
+
+    // TODO: remove
+    float getTerminalDiffCostG(int path) const {
+        return mDicNodeState.mDicNodeStateInput.getTerminalDiffCost(path);
+    }
+
+    //////////////////////
+    // Temporary getter //
+    // TODO: Remove     //
+    //////////////////////
+    // TODO: Remove once touch path is merged into ProximityInfoState
+    // Note: Returned codepoint may be a digraph codepoint if the node is in a composite glyph.
+    int getNodeCodePoint() const {
+        const int codePoint = mDicNodeProperties.getDicNodeCodePoint();
+        const DigraphUtils::DigraphCodePointIndex digraphIndex =
+                mDicNodeState.mDicNodeStateScoring.getDigraphIndex();
+        if (digraphIndex == DigraphUtils::NOT_A_DIGRAPH_INDEX) {
+            return codePoint;
+        }
+        return DigraphUtils::getDigraphCodePointForIndex(codePoint, digraphIndex);
+    }
+
+    ////////////////////////////////
+    // Utils for cost calculation //
+    ////////////////////////////////
+    AK_FORCE_INLINE bool isSameNodeCodePoint(const DicNode *const dicNode) const {
+        return mDicNodeProperties.getDicNodeCodePoint()
+                == dicNode->mDicNodeProperties.getDicNodeCodePoint();
+    }
+
+    // TODO: remove
+    // TODO: rename getNextInputIndex
+    int16_t getInputIndex(int pointerId) const {
+        return mDicNodeState.mDicNodeStateInput.getInputIndex(pointerId);
+    }
+
+    ////////////////////////////////////
+    // Getter of features for scoring //
+    ////////////////////////////////////
+    float getSpatialDistanceForScoring() const {
+        return mDicNodeState.mDicNodeStateScoring.getSpatialDistance();
+    }
+
+    // For space-aware gestures, we store the normalized distance at the char index
+    // that ends the first word of the suggestion. We call this the distance after
+    // first word.
+    float getNormalizedCompoundDistanceAfterFirstWord() const {
+        return mDicNodeState.mDicNodeStateScoring.getNormalizedCompoundDistanceAfterFirstWord();
+    }
+
+    float getRawLength() const {
+        return mDicNodeState.mDicNodeStateScoring.getRawLength();
+    }
+
+    DoubleLetterLevel getDoubleLetterLevel() const {
+        return mDicNodeState.mDicNodeStateScoring.getDoubleLetterLevel();
+    }
+
+    void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) {
+        mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(doubleLetterLevel);
+    }
+
+    bool isInDigraph() const {
+        return mDicNodeState.mDicNodeStateScoring.getDigraphIndex()
+                != DigraphUtils::NOT_A_DIGRAPH_INDEX;
+    }
+
+    void advanceDigraphIndex() {
+        mDicNodeState.mDicNodeStateScoring.advanceDigraphIndex();
+    }
+
+    ErrorTypeUtils::ErrorType getContainedErrorTypes() const {
+        return mDicNodeState.mDicNodeStateScoring.getContainedErrorTypes();
+    }
+
+    // Code point count of the current word only, i.e. the depth of this node.
+    inline uint16_t getNodeCodePointCount() const {
+        return mDicNodeProperties.getDepth();
+    }
+
+    // Returns code point count including spaces
+    inline uint16_t getTotalNodeCodePointCount() const {
+        return getNodeCodePointCount() + mDicNodeState.mDicNodeStateOutput.getPrevWordsLength();
+    }
+
+    AK_FORCE_INLINE void dump(const char *tag) const {
+#if DEBUG_DICT
+        DUMP_WORD_AND_SCORE(tag);
+#if DEBUG_DUMP_ERROR
+        mProfiler.dump();
+#endif
+#endif
+    }
+
+    // Ordering used to rank nodes; returns true when this node ranks ahead of right. Ties are
+    // broken, in order, by exact match, smaller normalized compound distance, smaller depth,
+    // smaller code points and finally by address.
+    AK_FORCE_INLINE bool compare(const DicNode *right) const {
+        // Promote exact matches to prevent them from being pruned.
+        const bool leftExactMatch = ErrorTypeUtils::isExactMatch(getContainedErrorTypes());
+        const bool rightExactMatch = ErrorTypeUtils::isExactMatch(right->getContainedErrorTypes());
+        if (leftExactMatch != rightExactMatch) {
+            return leftExactMatch;
+        }
+        const float diff =
+                right->getNormalizedCompoundDistance() - getNormalizedCompoundDistance();
+        static const float MIN_DIFF = 0.000001f;
+        if (diff > MIN_DIFF) {
+            return true;
+        } else if (diff < -MIN_DIFF) {
+            return false;
+        }
+        const int depth = getNodeCodePointCount();
+        const int depthDiff = right->getNodeCodePointCount() - depth;
+        if (depthDiff != 0) {
+            return depthDiff > 0;
+        }
+        for (int i = 0; i < depth; ++i) {
+            const int codePoint = mDicNodeState.mDicNodeStateOutput.getCurrentWordCodePointAt(i);
+            const int rightCodePoint =
+                    right->mDicNodeState.mDicNodeStateOutput.getCurrentWordCodePointAt(i);
+            if (codePoint != rightCodePoint) {
+                return rightCodePoint > codePoint;
+            }
+        }
+        // Compare pointer values here for stable comparison
+        return this > right;
+    }
+
+ private:
+    DicNodeProperties mDicNodeProperties;
+    DicNodeState mDicNodeState;
+    // TODO: Remove
+    bool mIsCachedForNextSuggestion;
+
+    // Sum of the input indices over all MAX_POINTER_COUNT_G pointers.
+    AK_FORCE_INLINE int getTotalInputIndex() const {
+        int index = 0;
+        for (int i = 0; i < MAX_POINTER_COUNT_G; i++) {
+            index += mDicNodeState.mDicNodeStateInput.getInputIndex(i);
+        }
+        return index;
+    }
+
+    // Caveat: Must not be called outside Weighting
+    // This restriction is guaranteed by "friend"
+    AK_FORCE_INLINE void addCost(const float spatialCost, const float languageCost,
+            const bool doNormalization, const int inputSize,
+            const ErrorTypeUtils::ErrorType errorType) {
+        if (DEBUG_GEO_FULL) {
+            LOGI_SHOW_ADD_COST_PROP;
+        }
+        mDicNodeState.mDicNodeStateScoring.addCost(spatialCost, languageCost, doNormalization,
+                inputSize, getTotalInputIndex(), errorType);
+    }
+
+    // Saves the current normalized compound distance for space-aware gestures.
+    // See getNormalizedCompoundDistanceAfterFirstWord for details.
+    AK_FORCE_INLINE void saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet() {
+        mDicNodeState.mDicNodeStateScoring.saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet();
+    }
+
+    // Caveat: Must not be called outside Weighting
+    // This restriction is guaranteed by "friend"
+    AK_FORCE_INLINE void forwardInputIndex(const int pointerId, const int count,
+            const bool overwritesPrevCodePointByNodeCodePoint) {
+        if (count == 0) {
+            return;
+        }
+        mDicNodeState.mDicNodeStateInput.forwardInputIndex(pointerId, count);
+        if (overwritesPrevCodePointByNodeCodePoint) {
+            // NOTE(review): the previous code point is always written for pointer 0 rather than
+            // for pointerId -- confirm that this is intentional.
+            mDicNodeState.mDicNodeStateInput.setPrevCodePoint(0, getNodeCodePoint());
+        }
+    }
+
+    // Applies the values carried by inputStateG to the input and scoring state. For the first
+    // letter of the second word it also records the input index where that word starts.
+    AK_FORCE_INLINE void updateInputIndexG(const DicNode_InputStateG *const inputStateG) {
+        if (mDicNodeState.mDicNodeStateOutput.getPrevWordCount() == 1 && isFirstLetter()) {
+            mDicNodeState.mDicNodeStateOutput.setSecondWordFirstInputIndex(
+                    inputStateG->mInputIndex);
+        }
+        mDicNodeState.mDicNodeStateInput.updateInputIndexG(inputStateG->mPointerId,
+                inputStateG->mInputIndex, inputStateG->mPrevCodePoint,
+                inputStateG->mTerminalDiffCost, inputStateG->mRawLength);
+        mDicNodeState.mDicNodeStateScoring.addRawLength(inputStateG->mRawLength);
+        mDicNodeState.mDicNodeStateScoring.setDoubleLetterLevel(inputStateG->mDoubleLetterLevel);
+    }
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_H
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node_pool.h b/app/src/main/jni/src/suggest/core/dicnode/dic_node_pool.h
new file mode 100644
index 000000000..a660b744f
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node_pool.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_POOL_H
+#define LATINIME_DIC_NODE_POOL_H
+
+#include <deque>
+#include <unordered_set>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node.h"
+
+namespace latinime {
+
+// Fixed-capacity pool of DicNode instances. All nodes live in mDicNodes; mPooledDicNodes holds
+// pointers to the ones that are currently free to hand out.
+class DicNodePool {
+ public:
+    explicit DicNodePool(const int capacity) : mDicNodes(), mPooledDicNodes() {
+        reset(capacity);
+    }
+
+    // Resizes the pool to capacity nodes and marks every node as available. Nothing happens
+    // when the pool already has that capacity and every node is available. Pointers handed out
+    // earlier must not be used after a reset that resizes the storage.
+    void reset(const int capacity) {
+        if (capacity == static_cast<int>(mDicNodes.size())
+                && capacity == static_cast<int>(mPooledDicNodes.size())) {
+            // No need to reset.
+            return;
+        }
+        mDicNodes.resize(capacity);
+        mDicNodes.shrink_to_fit();
+        mPooledDicNodes.clear();
+        for (auto &dicNode : mDicNodes) {
+            mPooledDicNodes.emplace_back(&dicNode);
+        }
+    }
+
+    // Get a DicNode instance from the pool, or nullptr when the pool is exhausted. The instance
+    // has to be returned by placeBackInstance().
+    DicNode *getInstance() {
+        if (mPooledDicNodes.empty()) {
+            return nullptr;
+        }
+        DicNode *const dicNode = mPooledDicNodes.back();
+        mPooledDicNodes.pop_back();
+        return dicNode;
+    }
+
+    // Return an instance that has been removed from the pool by getInstance() to the pool. The
+    // instance must not be used after returning without getInstance().
+    void placeBackInstance(DicNode *dicNode) {
+        mPooledDicNodes.emplace_back(dicNode);
+    }
+
+    // Dumps every node that is currently handed out, i.e. present in mDicNodes but absent from
+    // mPooledDicNodes.
+    void dump() const {
+        AKLOGI("\n\n\n\n\n===========================");
+        std::unordered_set<const DicNode *> usedDicNodes;
+        for (const auto &dicNode : mDicNodes) {
+            usedDicNodes.insert(&dicNode);
+        }
+        for (const auto &dicNodePtr : mPooledDicNodes) {
+            usedDicNodes.erase(dicNodePtr);
+        }
+        for (const auto &usedDicNodePtr : usedDicNodes) {
+            usedDicNodePtr->dump("DIC_NODE_POOL: ");
+        }
+        AKLOGI("===========================\n\n\n\n\n");
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodePool);
+
+    // Storage that owns every node of the pool.
+    std::vector<DicNode> mDicNodes;
+    // Pointers into mDicNodes for the nodes that are available.
+    std::deque<DicNode *> mPooledDicNodes;
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_POOL_H
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node_priority_queue.h b/app/src/main/jni/src/suggest/core/dicnode/dic_node_priority_queue.h
new file mode 100644
index 000000000..7b753f2e4
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node_priority_queue.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_PRIORITY_QUEUE_H
+#define LATINIME_DIC_NODE_PRIORITY_QUEUE_H
+
+#include <algorithm>
+#include <queue>
+#include <vector>
+
+#include "defines.h"
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_pool.h"
+
+namespace latinime {
+
+// Bounded priority queue of DicNodes. Nodes are copied into instances taken from an internal
+// DicNodePool; once mMaxSize nodes are queued, a new node only gets in by replacing the node
+// at the top of the queue, which is treated as the worst one.
+class DicNodePriorityQueue {
+ public:
+    AK_FORCE_INLINE explicit DicNodePriorityQueue(const int capacity)
+            : mMaxSize(capacity), mDicNodesQueue(), mDicNodePool(capacity) {
+        clear();
+    }
+
+    // Non virtual inline destructor -- never inherit this class
+    AK_FORCE_INLINE ~DicNodePriorityQueue() {}
+
+    AK_FORCE_INLINE int getSize() const {
+        return static_cast<int>(mDicNodesQueue.size());
+    }
+
+    AK_FORCE_INLINE int getMaxSize() const {
+        return mMaxSize;
+    }
+
+    // Changes the size limit only; the pool is not resized until clear() or clearAndResize().
+    AK_FORCE_INLINE void setMaxSize(const int maxSize) {
+        mMaxSize = maxSize;
+    }
+
+    AK_FORCE_INLINE void clear() {
+        clearAndResize(mMaxSize);
+    }
+
+    // Empties the queue and resets the pool to maxSize + 1 nodes. The extra node lets copyPush()
+    // stage a candidate while the queue is full.
+    AK_FORCE_INLINE void clearAndResize(const int maxSize) {
+        mMaxSize = maxSize;
+        while (!mDicNodesQueue.empty()) {
+            mDicNodesQueue.pop();
+        }
+        mDicNodePool.reset(mMaxSize + 1);
+    }
+
+    // Pushes a copy of dicNode. When the queue is full, the copy replaces the worst queued node
+    // if it is better and is dropped otherwise. The node is silently dropped when the pool has
+    // no free instance.
+    AK_FORCE_INLINE void copyPush(const DicNode *const dicNode) {
+        DicNode *const pooledDicNode = newDicNode(dicNode);
+        if (!pooledDicNode) {
+            return;
+        }
+        if (getSize() < mMaxSize) {
+            mDicNodesQueue.push(pooledDicNode);
+            return;
+        }
+        if (betterThanWorstDicNode(pooledDicNode)) {
+            mDicNodePool.placeBackInstance(mDicNodesQueue.top());
+            mDicNodesQueue.pop();
+            mDicNodesQueue.push(pooledDicNode);
+            return;
+        }
+        mDicNodePool.placeBackInstance(pooledDicNode);
+    }
+
+    // Removes the top node and, when dest is not null, copies it into dest first.
+    AK_FORCE_INLINE void copyPop(DicNode *const dest) {
+        if (mDicNodesQueue.empty()) {
+            ASSERT(false);
+            return;
+        }
+        DicNode *node = mDicNodesQueue.top();
+        if (dest) {
+            DicNodeUtils::initByCopy(node, dest);
+        }
+        mDicNodePool.placeBackInstance(node);
+        mDicNodesQueue.pop();
+    }
+
+    AK_FORCE_INLINE void dump() {
+        mDicNodePool.dump();
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodePriorityQueue);
+
+    AK_FORCE_INLINE static bool compareDicNode(const DicNode *const left,
+            const DicNode *const right) {
+        return left->compare(right);
+    }
+
+    struct DicNodeComparator {
+        bool operator ()(const DicNode *left, const DicNode *right) const {
+            return compareDicNode(left, right);
+        }
+    };
+
+    typedef std::priority_queue<DicNode *, std::vector<DicNode *>, DicNodeComparator> DicNodesQueue;
+    int mMaxSize;
+    DicNodesQueue mDicNodesQueue;
+    DicNodePool mDicNodePool;
+
+    // Whether dicNode should replace the node at the top of the queue.
+    AK_FORCE_INLINE bool betterThanWorstDicNode(const DicNode *const dicNode) const {
+        if (mDicNodesQueue.empty()) {
+            // Only reachable when mMaxSize <= 0. There is nothing to replace, and calling top()
+            // on an empty queue is undefined behavior, so report "not better" and let the
+            // caller hand the candidate back to the pool.
+            return false;
+        }
+        DicNode *worstNode = mDicNodesQueue.top();
+        if (!worstNode) {
+            return true;
+        }
+        return compareDicNode(dicNode, worstNode);
+    }
+
+    // Takes an instance from the pool and initializes it as a copy of dicNode. Returns nullptr
+    // when the pool is exhausted.
+    AK_FORCE_INLINE DicNode *newDicNode(const DicNode *const dicNode) {
+        DicNode *newNode = mDicNodePool.getInstance();
+        if (newNode) {
+            DicNodeUtils::initByCopy(dicNode, newNode);
+        }
+        return newNode;
+    }
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_PRIORITY_QUEUE_H
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node_profiler.h b/app/src/main/jni/src/suggest/core/dicnode/dic_node_profiler.h
new file mode 100644
index 000000000..1f4d2570e
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node_profiler.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_PROFILER_H
+#define LATINIME_DIC_NODE_PROFILER_H
+
+#include "defines.h"
+
+// Profiling hooks. With DEBUG_DICT enabled each macro forwards to the matching DicNodeProfiler
+// member; otherwise every macro expands to nothing, so both branches must define the same set
+// of names.
+#if DEBUG_DICT
+#define PROF_SPACE_SUBSTITUTION(profiler) profiler.profSpaceSubstitution()
+#define PROF_SPACE_OMISSION(profiler) profiler.profSpaceOmission()
+#define PROF_ADDITIONAL_PROXIMITY(profiler) profiler.profAdditionalProximity()
+#define PROF_SUBSTITUTION(profiler) profiler.profSubstitution()
+#define PROF_OMISSION(profiler) profiler.profOmission()
+#define PROF_INSERTION(profiler) profiler.profInsertion()
+#define PROF_MATCH(profiler) profiler.profMatch()
+#define PROF_COMPLETION(profiler) profiler.profCompletion()
+#define PROF_TRANSPOSITION(profiler) profiler.profTransposition()
+#define PROF_NEARESTKEY(profiler) profiler.profNearestKey()
+#define PROF_TERMINAL(profiler) profiler.profTerminal()
+#define PROF_TERMINAL_INSERTION(profiler) profiler.profTerminalInsertion()
+#define PROF_NEW_WORD(profiler) profiler.profNewWord()
+#define PROF_NEW_WORD_BIGRAM(profiler) profiler.profNewWordBigram()
+#define PROF_NODE_RESET(profiler) profiler.reset()
+#define PROF_NODE_COPY(src, dest) dest.copy(src)
+#else
+#define PROF_SPACE_SUBSTITUTION(profiler)
+#define PROF_SPACE_OMISSION(profiler)
+#define PROF_ADDITIONAL_PROXIMITY(profiler)
+// Misspelled name that this branch used to define instead of PROF_ADDITIONAL_PROXIMITY; kept so
+// that any code relying on the old spelling still builds.
+#define PROF_ADDITONAL_PROXIMITY(profiler)
+#define PROF_SUBSTITUTION(profiler)
+#define PROF_OMISSION(profiler)
+#define PROF_INSERTION(profiler)
+#define PROF_MATCH(profiler)
+#define PROF_COMPLETION(profiler)
+#define PROF_TRANSPOSITION(profiler)
+#define PROF_NEARESTKEY(profiler)
+#define PROF_TERMINAL(profiler)
+#define PROF_TERMINAL_INSERTION(profiler)
+#define PROF_NEW_WORD(profiler)
+#define PROF_NEW_WORD_BIGRAM(profiler)
+#define PROF_NODE_RESET(profiler)
+#define PROF_NODE_COPY(src, dest)
+#endif
+
+namespace latinime {
+
+// Per-node event counters used for debugging. The counters only exist when DEBUG_DICT is
+// enabled; otherwise the class is empty.
+class DicNodeProfiler {
+ public:
+#if DEBUG_DICT
+    AK_FORCE_INLINE DicNodeProfiler()
+            : mProfOmission(0), mProfInsertion(0), mProfTransposition(0),
+              mProfAdditionalProximity(0), mProfSubstitution(0),
+              mProfSpaceSubstitution(0), mProfSpaceOmission(0),
+              mProfMatch(0), mProfCompletion(0), mProfTerminal(0), mProfTerminalInsertion(0),
+              mProfNearestKey(0), mProfNewWord(0), mProfNewWordBigram(0) {}
+
+    int mProfOmission;
+    int mProfInsertion;
+    int mProfTransposition;
+    int mProfAdditionalProximity;
+    int mProfSubstitution;
+    int mProfSpaceSubstitution;
+    int mProfSpaceOmission;
+    int mProfMatch;
+    int mProfCompletion;
+    int mProfTerminal;
+    int mProfTerminalInsertion;
+    int mProfNearestKey;
+    int mProfNewWord;
+    int mProfNewWordBigram;
+
+    void profSpaceSubstitution() {
+        ++mProfSpaceSubstitution;
+    }
+
+    void profSpaceOmission() {
+        ++mProfSpaceOmission;
+    }
+
+    void profAdditionalProximity() {
+        ++mProfAdditionalProximity;
+    }
+
+    void profSubstitution() {
+        ++mProfSubstitution;
+    }
+
+    void profOmission() {
+        ++mProfOmission;
+    }
+
+    void profInsertion() {
+        ++mProfInsertion;
+    }
+
+    void profMatch() {
+        ++mProfMatch;
+    }
+
+    void profCompletion() {
+        ++mProfCompletion;
+    }
+
+    void profTransposition() {
+        ++mProfTransposition;
+    }
+
+    void profNearestKey() {
+        ++mProfNearestKey;
+    }
+
+    void profTerminal() {
+        ++mProfTerminal;
+    }
+
+    void profTerminalInsertion() {
+        ++mProfTerminalInsertion;
+    }
+
+    void profNewWord() {
+        ++mProfNewWord;
+    }
+
+    void profNewWordBigram() {
+        ++mProfNewWordBigram;
+    }
+
+    // Sets every counter back to zero.
+    void reset() {
+        mProfSpaceSubstitution = 0;
+        mProfSpaceOmission = 0;
+        mProfAdditionalProximity = 0;
+        mProfSubstitution = 0;
+        mProfOmission = 0;
+        mProfInsertion = 0;
+        mProfMatch = 0;
+        mProfCompletion = 0;
+        mProfTransposition = 0;
+        mProfNearestKey = 0;
+        mProfTerminal = 0;
+        // Previously left out, so this counter survived a reset.
+        mProfTerminalInsertion = 0;
+        mProfNewWord = 0;
+        mProfNewWordBigram = 0;
+    }
+
+    // Copies every counter from profiler.
+    void copy(const DicNodeProfiler *const profiler) {
+        mProfSpaceSubstitution = profiler->mProfSpaceSubstitution;
+        mProfSpaceOmission = profiler->mProfSpaceOmission;
+        mProfAdditionalProximity = profiler->mProfAdditionalProximity;
+        mProfSubstitution = profiler->mProfSubstitution;
+        mProfOmission = profiler->mProfOmission;
+        mProfInsertion = profiler->mProfInsertion;
+        mProfMatch = profiler->mProfMatch;
+        mProfCompletion = profiler->mProfCompletion;
+        mProfTransposition = profiler->mProfTransposition;
+        mProfNearestKey = profiler->mProfNearestKey;
+        mProfTerminal = profiler->mProfTerminal;
+        // Previously left out, so copies kept their own stale value.
+        mProfTerminalInsertion = profiler->mProfTerminalInsertion;
+        mProfNewWord = profiler->mProfNewWord;
+        mProfNewWordBigram = profiler->mProfNewWordBigram;
+    }
+
+    // Logs a subset of the counters; mProfTerminalInsertion and mProfNearestKey are not printed.
+    void dump() const {
+        AKLOGI("O %d, I %d, T %d, AP %d, S %d, SS %d, SO %d, M %d, C %d, TE %d, NW = %d, NWB = %d",
+                mProfOmission, mProfInsertion, mProfTransposition, mProfAdditionalProximity,
+                mProfSubstitution, mProfSpaceSubstitution, mProfSpaceOmission, mProfMatch,
+                mProfCompletion, mProfTerminal, mProfNewWord, mProfNewWordBigram);
+    }
+#else
+    DicNodeProfiler() {}
+#endif
+ private:
+    // Caution!!!
+    // Use a default copy constructor and an assign operator because shallow copies are ok
+    // for this class
+};
+}
+#endif // LATINIME_DIC_NODE_PROFILER_H
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/app/src/main/jni/src/suggest/core/dicnode/dic_node_utils.cpp
new file mode 100644
index 000000000..a20252cd2
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node_utils.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dicnode/dic_node_utils.h"
+
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_vector.h"
+
+namespace latinime {
+
+///////////////////////////////
+// Node initialization utils //
+///////////////////////////////
+
+// Initializes newRootDicNode as a root at the root position of the dictionary.
+/* static */ void DicNodeUtils::initAsRoot(
+        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+        const WordIdArrayView prevWordIds, DicNode *const newRootDicNode) {
+    newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordIds);
+}
+
+// Initializes newRootDicNode as a root that follows the word ending at prevWordLastDicNode.
+/* static */ void DicNodeUtils::initAsRootWithPreviousWord(
+        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+        const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode) {
+    newRootDicNode->initAsRootWithPreviousWord(
+            prevWordLastDicNode, dictionaryStructurePolicy->getRootPosition());
+}
+
+// Initializes destDicNode as a copy of srcDicNode.
+/* static */ void DicNodeUtils::initByCopy(const DicNode *const srcDicNode,
+        DicNode *const destDicNode) {
+    destDicNode->initByCopy(srcDicNode);
+}
+
+///////////////////////////////////
+// Traverse node expansion utils //
+///////////////////////////////////
+// Appends the children of dicNode to childDicNodes. Nothing is appended once the total input
+// size limit is exceeded. A node that has not reached its leaving depth yields a single passing
+// child; otherwise the children are obtained from the dictionary structure policy.
+/* static */ void DicNodeUtils::getAllChildDicNodes(const DicNode *dicNode,
+        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+        DicNodeVector *const childDicNodes) {
+    if (dicNode->isTotalInputSizeExceedingLimit()) {
+        return;
+    }
+    if (!dicNode->isLeavingNode()) {
+        childDicNodes->pushPassingChild(dicNode);
+    } else {
+        dictionaryStructurePolicy->createAndGetAllChildDicNodes(dicNode, childDicNodes);
+    }
+}
+
+///////////////////
+// Scoring utils //
+///////////////////
+/**
+ * Computes the combined bigram / unigram cost for the given dicNode.
+ *
+ * Returns MAX_VALUE_FOR_WEIGHTING when the multi-word suggestion is invalid, when the word has
+ * no probability, or when a multi-word suggestion ends with a blacklisted word or a non-word.
+ * Otherwise the cost is (MAX_PROBABILITY - probability) / MAX_PROBABILITY.
+ */
+/* static */ float DicNodeUtils::getBigramNodeImprobability(
+        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+        const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) {
+    if (dicNode->hasMultipleWords() && !dicNode->isValidMultipleWordSuggestion()) {
+        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
+    const WordAttributes wordAttributes = dictionaryStructurePolicy->getWordAttributesInContext(
+            dicNode->getPrevWordIds(), dicNode->getWordId(), multiBigramMap);
+    if (wordAttributes.getProbability() == NOT_A_PROBABILITY
+            || (dicNode->hasMultipleWords()
+                    && (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()))) {
+        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
+    // TODO: This equation to calculate the improbability looks unreasonable.  Investigate this.
+    const float cost = static_cast<float>(MAX_PROBABILITY - wordAttributes.getProbability())
+            / static_cast<float>(MAX_PROBABILITY);
+    return cost;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node_utils.h b/app/src/main/jni/src/suggest/core/dicnode/dic_node_utils.h
new file mode 100644
index 000000000..b891a842a
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node_utils.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DIC_NODE_UTILS_H
+#define LATINIME_DIC_NODE_UTILS_H
+
+#include "defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DicNode;
+class DicNodeVector;
+class DictionaryStructureWithBufferPolicy;
+class MultiBigramMap;
+
+// Static-only helper collection for initializing, expanding and scoring DicNodes.
+// Instances cannot be created (see DISALLOW_IMPLICIT_CONSTRUCTORS below).
+class DicNodeUtils {
+ public:
+    // Initializes newRootDicNode as a root node at the dictionary's root position,
+    // passing prevWordIds through to the node.
+    static void initAsRoot(
+            const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+            const WordIdArrayView prevWordIds, DicNode *const newRootDicNode);
+    // Initializes newRootDicNode as a root node that follows prevWordLastDicNode.
+    static void initAsRootWithPreviousWord(
+            const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+            const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode);
+    // Initializes destDicNode as a copy of srcDicNode.
+    static void initByCopy(const DicNode *const srcDicNode, DicNode *const destDicNode);
+    // Appends the children of dicNode to childDicNodes.
+    static void getAllChildDicNodes(const DicNode *dicNode,
+            const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+            DicNodeVector *childDicNodes);
+    // Returns the language-model cost of dicNode in its previous-word context; multiBigramMap
+    // is forwarded to the dictionary policy lookup.
+    static float getBigramNodeImprobability(
+            const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
+            const DicNode *const dicNode, MultiBigramMap *const multiBigramMap);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils);
+    // Max number of bigrams to look up
+    // NOTE(review): no use of this constant is visible in this header -- confirm it is still needed.
+    static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500;
+};
+} // namespace latinime
+#endif // LATINIME_DIC_NODE_UTILS_H
diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_node_vector.h b/app/src/main/jni/src/suggest/core/dicnode/dic_node_vector.h
new file mode 100644
index 000000000..e6b758954
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dicnode/dic_node_vector.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_VECTOR_H +#define LATINIME_DIC_NODE_VECTOR_H + +#include + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DicNodeVector { + public: +#ifdef FLAG_DBG + // 0 will introduce resizing the vector. + static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 0; +#else + static const int DEFAULT_NODES_SIZE_FOR_OPTIMIZATION = 60; +#endif + AK_FORCE_INLINE DicNodeVector() : mDicNodes(), mLock(false) {} + + // Specify the capacity of the vector + AK_FORCE_INLINE DicNodeVector(const int size) : mDicNodes(), mLock(false) { + mDicNodes.reserve(size); + } + + // Non virtual inline destructor -- never inherit this class + AK_FORCE_INLINE ~DicNodeVector() {} + + AK_FORCE_INLINE void clear() { + mDicNodes.clear(); + mLock = false; + } + + int getSizeAndLock() { + mLock = true; + return static_cast(mDicNodes.size()); + } + + void pushPassingChild(const DicNode *dicNode) { + ASSERT(!mLock); + mDicNodes.emplace_back(); + mDicNodes.back().initAsPassingChild(dicNode); + } + + void pushLeavingChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos, + const int wordId, const CodePointArrayView mergedCodePoints) { + ASSERT(!mLock); + mDicNodes.emplace_back(); + mDicNodes.back().initAsChild(dicNode, childrenPtNodeArrayPos, wordId, mergedCodePoints); + } + + DicNode *operator[](const int id) { + ASSERT(id < static_cast(mDicNodes.size())); + return &mDicNodes[id]; + } + + DicNode *front() { + ASSERT(1 <= static_cast(mDicNodes.size())); + return 
&mDicNodes.front(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeVector); + std::vector mDicNodes; + bool mLock; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_VECTOR_H diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_nodes_cache.cpp b/app/src/main/jni/src/suggest/core/dicnode/dic_nodes_cache.cpp new file mode 100644 index 000000000..ef4a6b5d8 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/dic_nodes_cache.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "defines.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dicnode/dic_nodes_cache.h" + +namespace latinime { + +// The biggest value among MAX_CACHE_DIC_NODE_SIZE, MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT, ... +const int DicNodesCache::LARGE_PRIORITY_QUEUE_CAPACITY = 310; +// Capacity for reducing memory footprint. 
+const int DicNodesCache::SMALL_PRIORITY_QUEUE_CAPACITY = 100; + +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/dicnode/dic_nodes_cache.h b/app/src/main/jni/src/suggest/core/dicnode/dic_nodes_cache.h new file mode 100644 index 000000000..fb76c731f --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/dic_nodes_cache.h @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODES_CACHE_H +#define LATINIME_DIC_NODES_CACHE_H + +#include + +#include "defines.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" + +namespace latinime { + +class DicNode; + +/** + * Class for controlling dicNode search priority queue and lexicon trie traversal. 
+ */ +class DicNodesCache { + public: + AK_FORCE_INLINE explicit DicNodesCache(const bool usesLargeCapacityCache) + : mUsesLargeCapacityCache(usesLargeCapacityCache), + mDicNodePriorityQueue0(getCacheCapacity()), + mDicNodePriorityQueue1(getCacheCapacity()), + mDicNodePriorityQueue2(getCacheCapacity()), + mDicNodePriorityQueueForTerminal(MAX_RESULTS), + mActiveDicNodes(&mDicNodePriorityQueue0), + mNextActiveDicNodes(&mDicNodePriorityQueue1), + mCachedDicNodesForContinuousSuggestion(&mDicNodePriorityQueue2), + mTerminalDicNodes(&mDicNodePriorityQueueForTerminal), + mInputIndex(0), mLastCachedInputIndex(0) {} + + AK_FORCE_INLINE virtual ~DicNodesCache() {} + + AK_FORCE_INLINE void reset(const int nextActiveSize, const int terminalSize) { + mInputIndex = 0; + mLastCachedInputIndex = 0; + // The size of current active DicNode queue doesn't have to be changed. + mActiveDicNodes->clear(); + // nextActiveSize is used to limit the next iteration's active DicNode size. + const int nextActiveSizeFittingToTheCapacity = std::min(nextActiveSize, getCacheCapacity()); + mNextActiveDicNodes->clearAndResize(nextActiveSizeFittingToTheCapacity); + mTerminalDicNodes->clearAndResize(terminalSize); + // The size of cached DicNode queue doesn't have to be changed. 
+ mCachedDicNodesForContinuousSuggestion->clear(); + } + + AK_FORCE_INLINE void continueSearch() { + resetTemporaryCaches(); + restoreActiveDicNodesFromCache(); + } + + AK_FORCE_INLINE void advanceActiveDicNodes() { + if (DEBUG_DICT) { + AKLOGI("Advance active %d nodes.", mNextActiveDicNodes->getSize()); + } + if (DEBUG_DICT_FULL) { + mNextActiveDicNodes->dump(); + } + mNextActiveDicNodes = + moveNodesAndReturnReusableEmptyQueue(mNextActiveDicNodes, &mActiveDicNodes); + } + + int activeSize() const { return mActiveDicNodes->getSize(); } + int terminalSize() const { return mTerminalDicNodes->getSize(); } + bool isLookAheadCorrectionInputIndex(const int inputIndex) const { + return inputIndex == mInputIndex - 1; + } + void advanceInputIndex(const int inputSize) { + if (mInputIndex < inputSize) { + mInputIndex++; + } + } + + AK_FORCE_INLINE void copyPushTerminal(DicNode *dicNode) { + mTerminalDicNodes->copyPush(dicNode); + } + + AK_FORCE_INLINE void copyPushActive(DicNode *dicNode) { + mActiveDicNodes->copyPush(dicNode); + } + + AK_FORCE_INLINE void copyPushContinue(DicNode *dicNode) { + mCachedDicNodesForContinuousSuggestion->copyPush(dicNode); + } + + AK_FORCE_INLINE void copyPushNextActive(DicNode *dicNode) { + mNextActiveDicNodes->copyPush(dicNode); + } + + void popTerminal(DicNode *dest) { + mTerminalDicNodes->copyPop(dest); + } + + void popActive(DicNode *dest) { + mActiveDicNodes->copyPop(dest); + } + + bool hasCachedDicNodesForContinuousSuggestion() const { + return mCachedDicNodesForContinuousSuggestion + && mCachedDicNodesForContinuousSuggestion->getSize() > 0; + } + + AK_FORCE_INLINE bool isCacheBorderForTyping(const int inputSize) const { + // TODO: Move this variable to header + static const int CACHE_BACK_LENGTH = 3; + const int cacheInputIndex = inputSize - CACHE_BACK_LENGTH; + const bool shouldCache = (cacheInputIndex == mInputIndex) + && (cacheInputIndex != mLastCachedInputIndex); + return shouldCache; + } + + AK_FORCE_INLINE void 
updateLastCachedInputIndex() { + mLastCachedInputIndex = mInputIndex; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodesCache); + + AK_FORCE_INLINE void restoreActiveDicNodesFromCache() { + if (DEBUG_DICT) { + AKLOGI("Restore %d nodes. inputIndex = %d.", + mCachedDicNodesForContinuousSuggestion->getSize(), mLastCachedInputIndex); + } + if (DEBUG_DICT_FULL || DEBUG_CACHE) { + mCachedDicNodesForContinuousSuggestion->dump(); + } + mInputIndex = mLastCachedInputIndex; + mCachedDicNodesForContinuousSuggestion = moveNodesAndReturnReusableEmptyQueue( + mCachedDicNodesForContinuousSuggestion, &mActiveDicNodes); + } + + AK_FORCE_INLINE static DicNodePriorityQueue *moveNodesAndReturnReusableEmptyQueue( + DicNodePriorityQueue *src, DicNodePriorityQueue **dest) { + const int srcMaxSize = src->getMaxSize(); + const int destMaxSize = (*dest)->getMaxSize(); + DicNodePriorityQueue *tmp = *dest; + *dest = src; + (*dest)->setMaxSize(destMaxSize); + tmp->clearAndResize(srcMaxSize); + return tmp; + } + + AK_FORCE_INLINE int getCacheCapacity() const { + return mUsesLargeCapacityCache ? + LARGE_PRIORITY_QUEUE_CAPACITY : SMALL_PRIORITY_QUEUE_CAPACITY; + } + + AK_FORCE_INLINE void resetTemporaryCaches() { + mActiveDicNodes->clear(); + mNextActiveDicNodes->clear(); + mTerminalDicNodes->clear(); + } + + static const int LARGE_PRIORITY_QUEUE_CAPACITY; + static const int SMALL_PRIORITY_QUEUE_CAPACITY; + + const bool mUsesLargeCapacityCache; + // Instances + DicNodePriorityQueue mDicNodePriorityQueue0; + DicNodePriorityQueue mDicNodePriorityQueue1; + DicNodePriorityQueue mDicNodePriorityQueue2; + DicNodePriorityQueue mDicNodePriorityQueueForTerminal; + + // Active dicNodes currently being expanded. + DicNodePriorityQueue *mActiveDicNodes; + // Next dicNodes to be expanded. + DicNodePriorityQueue *mNextActiveDicNodes; + // Cached dicNodes used for continuous suggestion. + DicNodePriorityQueue *mCachedDicNodesForContinuousSuggestion; + // Current top terminal dicNodes. 
+ DicNodePriorityQueue *mTerminalDicNodes; + int mInputIndex; + int mLastCachedInputIndex; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODES_CACHE_H diff --git a/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_properties.h b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_properties.h new file mode 100644 index 000000000..1b796b5d4 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_properties.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_PROPERTIES_H +#define LATINIME_DIC_NODE_PROPERTIES_H + +#include +#include + +#include "defines.h" +#include "utils/int_array_view.h" + +namespace latinime { + +/** + * PtNode information related to the DicNode from the lexicon trie. + */ +class DicNodeProperties { + public: + AK_FORCE_INLINE DicNodeProperties() + : mChildrenPtNodeArrayPos(NOT_A_DICT_POS), mDicNodeCodePoint(NOT_A_CODE_POINT), + mWordId(NOT_A_WORD_ID), mDepth(0), mLeavingDepth(0), mPrevWordCount(0) {} + + ~DicNodeProperties() {} + + // Should be called only once per DicNode is initialized. 
+ void init(const int childrenPos, const int nodeCodePoint, const int wordId, + const uint16_t depth, const uint16_t leavingDepth, const WordIdArrayView prevWordIds) { + mChildrenPtNodeArrayPos = childrenPos; + mDicNodeCodePoint = nodeCodePoint; + mWordId = wordId; + mDepth = depth; + mLeavingDepth = leavingDepth; + prevWordIds.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIds.size(); + } + + // Init for root with prevWordsPtNodePos which is used for n-gram + void init(const int rootPtNodeArrayPos, const WordIdArrayView prevWordIds) { + mChildrenPtNodeArrayPos = rootPtNodeArrayPos; + mDicNodeCodePoint = NOT_A_CODE_POINT; + mWordId = NOT_A_WORD_ID; + mDepth = 0; + mLeavingDepth = 0; + prevWordIds.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIds.size(); + } + + void initByCopy(const DicNodeProperties *const dicNodeProp) { + mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; + mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint; + mWordId = dicNodeProp->mWordId; + mDepth = dicNodeProp->mDepth; + mLeavingDepth = dicNodeProp->mLeavingDepth; + const WordIdArrayView prevWordIdArrayView = dicNodeProp->getPrevWordIds(); + prevWordIdArrayView.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIdArrayView.size(); + } + + // Init as passing child + void init(const DicNodeProperties *const dicNodeProp, const int codePoint) { + mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; + mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child + mWordId = dicNodeProp->mWordId; + mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child + mLeavingDepth = dicNodeProp->mLeavingDepth; + const WordIdArrayView prevWordIdArrayView = dicNodeProp->getPrevWordIds(); + prevWordIdArrayView.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIdArrayView.size(); + } + + int getChildrenPtNodeArrayPos() const { + return mChildrenPtNodeArrayPos; + } + + 
int getDicNodeCodePoint() const { + return mDicNodeCodePoint; + } + + uint16_t getDepth() const { + return mDepth; + } + + // TODO: Move to output? + uint16_t getLeavingDepth() const { + return mLeavingDepth; + } + + bool isTerminal() const { + return mWordId != NOT_A_WORD_ID; + } + + bool hasChildren() const { + return (mChildrenPtNodeArrayPos != NOT_A_DICT_POS) || mDepth != mLeavingDepth; + } + + const WordIdArrayView getPrevWordIds() const { + return WordIdArrayView::fromArray(mPrevWordIds).limit(mPrevWordCount); + } + + int getWordId() const { + return mWordId; + } + + private: + // Caution!!! + // Use a default copy constructor and an assign operator because shallow copies are ok + // for this class + int mChildrenPtNodeArrayPos; + int mDicNodeCodePoint; + int mWordId; + uint16_t mDepth; + uint16_t mLeavingDepth; + WordIdArray mPrevWordIds; + size_t mPrevWordCount; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_PROPERTIES_H diff --git a/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state.h b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state.h new file mode 100644 index 000000000..badb1f5f2 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DIC_NODE_STATE_H +#define LATINIME_DIC_NODE_STATE_H + +#include "defines.h" +#include "suggest/core/dicnode/internal/dic_node_state_input.h" +#include "suggest/core/dicnode/internal/dic_node_state_output.h" +#include "suggest/core/dicnode/internal/dic_node_state_scoring.h" + +namespace latinime { + +class DicNodeState { + public: + DicNodeStateInput mDicNodeStateInput; + DicNodeStateOutput mDicNodeStateOutput; + DicNodeStateScoring mDicNodeStateScoring; + + AK_FORCE_INLINE DicNodeState() + : mDicNodeStateInput(), mDicNodeStateOutput(), mDicNodeStateScoring() {} + + ~DicNodeState() {} + + DicNodeState &operator=(const DicNodeState& src) { + initByCopy(&src); + return *this; + } + + DicNodeState(const DicNodeState& src) + : mDicNodeStateInput(), mDicNodeStateOutput(), mDicNodeStateScoring() { + initByCopy(&src); + } + + // Init for root + void init() { + mDicNodeStateInput.init(); + mDicNodeStateOutput.init(); + mDicNodeStateScoring.init(); + } + + // Init with previous word. + void initAsRootWithPreviousWord(const DicNodeState *prevWordDicNodeState, + const int prevWordCodePointCount) { + mDicNodeStateOutput.init(&prevWordDicNodeState->mDicNodeStateOutput); + mDicNodeStateInput.init( + &prevWordDicNodeState->mDicNodeStateInput, true /* resetTerminalDiffCost */); + mDicNodeStateScoring.initByCopy(&prevWordDicNodeState->mDicNodeStateScoring); + } + + // Init by copy + AK_FORCE_INLINE void initByCopy(const DicNodeState *const src) { + mDicNodeStateInput.initByCopy(&src->mDicNodeStateInput); + mDicNodeStateOutput.initByCopy(&src->mDicNodeStateOutput); + mDicNodeStateScoring.initByCopy(&src->mDicNodeStateScoring); + } + + // Init by copy and adding merged node code points. 
+ void init(const DicNodeState *const src, const uint16_t mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { + initByCopy(src); + mDicNodeStateOutput.addMergedNodeCodePoints( + mergedNodeCodePointCount, mergedNodeCodePoints); + } +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_H diff --git a/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_input.h b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_input.h new file mode 100644 index 000000000..50a37ba3e --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_input.h @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_INPUT_H +#define LATINIME_DIC_NODE_STATE_INPUT_H + +#include "defines.h" + +namespace latinime { + +// TODO: Have a .cpp for this class +class DicNodeStateInput { + public: + DicNodeStateInput() {} + ~DicNodeStateInput() {} + + void init() { + for (int i = 0; i < MAX_POINTER_COUNT_G; i++) { + // TODO: The initial value for mInputIndex should be -1? + //mInputIndex[i] = i == 0 ? 
0 : -1; + mInputIndex[i] = 0; + mPrevCodePoint[i] = NOT_A_CODE_POINT; + mTerminalDiffCost[i] = static_cast(MAX_VALUE_FOR_WEIGHTING); + } + } + + void init(const DicNodeStateInput *const src, const bool resetTerminalDiffCost) { + for (int i = 0; i < MAX_POINTER_COUNT_G; i++) { + mInputIndex[i] = src->mInputIndex[i]; + mPrevCodePoint[i] = src->mPrevCodePoint[i]; + mTerminalDiffCost[i] = resetTerminalDiffCost ? + static_cast(MAX_VALUE_FOR_WEIGHTING) : src->mTerminalDiffCost[i]; + } + } + + void updateInputIndexG(const int pointerId, const int inputIndex, + const int prevCodePoint, const float terminalDiffCost, const float rawLength) { + mInputIndex[pointerId] = inputIndex; + mPrevCodePoint[pointerId] = prevCodePoint; + mTerminalDiffCost[pointerId] = terminalDiffCost; + } + + void initByCopy(const DicNodeStateInput *const src) { + init(src, false); + } + + // For transposition + void setPrevCodePoint(const int pointerId, const int c) { + mPrevCodePoint[pointerId] = c; + } + + void forwardInputIndex(const int pointerId, const int val) { + if (mInputIndex[pointerId] < 0) { + mInputIndex[pointerId] = val; + } else { + mInputIndex[pointerId] = mInputIndex[pointerId] + val; + } + } + + int getInputIndex(const int pointerId) const { + // when "inputIndex" exceeds "inputSize", auto-completion needs to be done + return mInputIndex[pointerId]; + } + + int getPrevCodePoint(const int pointerId) const { + return mPrevCodePoint[pointerId]; + } + + float getTerminalDiffCost(const int pointerId) const { + return mTerminalDiffCost[pointerId]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeStateInput); + + int mInputIndex[MAX_POINTER_COUNT_G]; + int mPrevCodePoint[MAX_POINTER_COUNT_G]; + float mTerminalDiffCost[MAX_POINTER_COUNT_G]; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_INPUT_H diff --git a/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h new file mode 
100644 index 000000000..69a886f55 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_NODE_STATE_OUTPUT_H +#define LATINIME_DIC_NODE_STATE_OUTPUT_H + +#include +#include +#include // for memmove() + +#include "defines.h" + +namespace latinime { + +// Class to have information to be output. This can contain previous words when the suggestion +// is a multi-word suggestion. +class DicNodeStateOutput { + public: + DicNodeStateOutput() + : mOutputtedCodePointCount(0), mCurrentWordStart(0), mPrevWordCount(0), + mPrevWordsLength(0), mPrevWordStart(0), mSecondWordFirstInputIndex(NOT_AN_INDEX) {} + + ~DicNodeStateOutput() {} + + // Init for root + void init() { + mOutputtedCodePointCount = 0; + mCurrentWordStart = 0; + mOutputCodePoints[0] = 0; + mPrevWordCount = 0; + mPrevWordsLength = 0; + mPrevWordStart = 0; + mSecondWordFirstInputIndex = NOT_AN_INDEX; + } + + // Init for next word. 
+ void init(const DicNodeStateOutput *const stateOutput) { + mOutputtedCodePointCount = stateOutput->mOutputtedCodePointCount + 1; + memmove(mOutputCodePoints, stateOutput->mOutputCodePoints, + stateOutput->mOutputtedCodePointCount * sizeof(mOutputCodePoints[0])); + mOutputCodePoints[stateOutput->mOutputtedCodePointCount] = KEYCODE_SPACE; + mCurrentWordStart = stateOutput->mOutputtedCodePointCount + 1; + mPrevWordCount = std::min(static_cast(stateOutput->mPrevWordCount + 1), + static_cast(MAX_RESULTS)); + mPrevWordsLength = stateOutput->mOutputtedCodePointCount + 1; + mPrevWordStart = stateOutput->mCurrentWordStart; + mSecondWordFirstInputIndex = stateOutput->mSecondWordFirstInputIndex; + } + + void initByCopy(const DicNodeStateOutput *const stateOutput) { + memmove(mOutputCodePoints, stateOutput->mOutputCodePoints, + stateOutput->mOutputtedCodePointCount * sizeof(mOutputCodePoints[0])); + mOutputtedCodePointCount = stateOutput->mOutputtedCodePointCount; + if (mOutputtedCodePointCount < MAX_WORD_LENGTH) { + mOutputCodePoints[mOutputtedCodePointCount] = 0; + } + mCurrentWordStart = stateOutput->mCurrentWordStart; + mPrevWordCount = stateOutput->mPrevWordCount; + mPrevWordsLength = stateOutput->mPrevWordsLength; + mPrevWordStart = stateOutput->mPrevWordStart; + mSecondWordFirstInputIndex = stateOutput->mSecondWordFirstInputIndex; + } + + void addMergedNodeCodePoints(const uint16_t mergedNodeCodePointCount, + const int *const mergedNodeCodePoints) { + if (mergedNodeCodePoints) { + const int additionalCodePointCount = std::min( + static_cast(mergedNodeCodePointCount), + MAX_WORD_LENGTH - mOutputtedCodePointCount); + memmove(&mOutputCodePoints[mOutputtedCodePointCount], mergedNodeCodePoints, + additionalCodePointCount * sizeof(mOutputCodePoints[0])); + mOutputtedCodePointCount = static_cast( + mOutputtedCodePointCount + additionalCodePointCount); + if (mOutputtedCodePointCount < MAX_WORD_LENGTH) { + mOutputCodePoints[mOutputtedCodePointCount] = 0; + } + } + } + + int 
getCurrentWordCodePointAt(const int index) const { + return mOutputCodePoints[mCurrentWordStart + index]; + } + + const int *getCodePointBuf() const { + return mOutputCodePoints; + } + + void setSecondWordFirstInputIndex(const int inputIndex) { + mSecondWordFirstInputIndex = inputIndex; + } + + int getSecondWordFirstInputIndex() const { + return mSecondWordFirstInputIndex; + } + + // TODO: remove + int16_t getPrevWordsLength() const { + return mPrevWordsLength; + } + + int16_t getPrevWordCount() const { + return mPrevWordCount; + } + + int16_t getPrevWordStart() const { + return mPrevWordStart; + } + + int getOutputCodePointAt(const int id) const { + return mOutputCodePoints[id]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeStateOutput); + + // When the DicNode represents "this is a pen": + // mOutputtedCodePointCount is 13, which is total code point count of "this is a pen" including + // spaces. + // mCurrentWordStart indicates the head of "pen", thus it is 10. + // This contains 3 previous words, "this", "is" and "a"; thus, mPrevWordCount is 3. + // mPrevWordsLength is length of "this is a ", which is 10. + // mPrevWordStart is the start index of "a"; thus, it is 8. + // mSecondWordFirstInputIndex is the first input index of "is". + + uint16_t mOutputtedCodePointCount; + int mOutputCodePoints[MAX_WORD_LENGTH]; + int16_t mCurrentWordStart; + // Previous word count in mOutputCodePoints. + int16_t mPrevWordCount; + // Total length of previous words in mOutputCodePoints. This is being used by the algorithm + // that may want to look at the previous word information. + int16_t mPrevWordsLength; + // Start index of the previous word in mOutputCodePoints. This is being used for auto commit. 
+ int16_t mPrevWordStart; + int mSecondWordFirstInputIndex; +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_OUTPUT_H diff --git a/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h new file mode 100644 index 000000000..3a54c2599 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DIC_NODE_STATE_SCORING_H +#define LATINIME_DIC_NODE_STATE_SCORING_H + +#include +#include + +#include "defines.h" +#include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" + +namespace latinime { + +class DicNodeStateScoring { + public: + AK_FORCE_INLINE DicNodeStateScoring() + : mDoubleLetterLevel(NOT_A_DOUBLE_LETTER), + mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX), + mEditCorrectionCount(0), mProximityCorrectionCount(0), mCompletionCount(0), + mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f), + mRawLength(0.0f), mContainedErrorTypes(ErrorTypeUtils::NOT_AN_ERROR), + mNormalizedCompoundDistanceAfterFirstWord(MAX_VALUE_FOR_WEIGHTING) { + } + + ~DicNodeStateScoring() {} + + void init() { + mEditCorrectionCount = 0; + mProximityCorrectionCount = 0; + mCompletionCount = 0; + mNormalizedCompoundDistance = 0.0f; + mSpatialDistance = 0.0f; + mLanguageDistance = 0.0f; + mRawLength = 0.0f; + mDoubleLetterLevel = NOT_A_DOUBLE_LETTER; + mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX; + mNormalizedCompoundDistanceAfterFirstWord = MAX_VALUE_FOR_WEIGHTING; + mContainedErrorTypes = ErrorTypeUtils::NOT_AN_ERROR; + } + + AK_FORCE_INLINE void initByCopy(const DicNodeStateScoring *const scoring) { + mEditCorrectionCount = scoring->mEditCorrectionCount; + mProximityCorrectionCount = scoring->mProximityCorrectionCount; + mCompletionCount = scoring->mCompletionCount; + mNormalizedCompoundDistance = scoring->mNormalizedCompoundDistance; + mSpatialDistance = scoring->mSpatialDistance; + mLanguageDistance = scoring->mLanguageDistance; + mRawLength = scoring->mRawLength; + mDoubleLetterLevel = scoring->mDoubleLetterLevel; + mDigraphIndex = scoring->mDigraphIndex; + mContainedErrorTypes = scoring->mContainedErrorTypes; + mNormalizedCompoundDistanceAfterFirstWord = + scoring->mNormalizedCompoundDistanceAfterFirstWord; + } + + void addCost(const float spatialCost, const float 
languageCost, const bool doNormalization, + const int inputSize, const int totalInputIndex, + const ErrorTypeUtils::ErrorType errorType) { + addDistance(spatialCost, languageCost, doNormalization, inputSize, totalInputIndex); + mContainedErrorTypes = mContainedErrorTypes | errorType; + if (ErrorTypeUtils::isEditCorrectionError(errorType)) { + ++mEditCorrectionCount; + } + if (ErrorTypeUtils::isProximityCorrectionError(errorType)) { + ++mProximityCorrectionCount; + } + if (ErrorTypeUtils::isCompletion(errorType)) { + ++mCompletionCount; + } + } + + // Saves the current normalized distance for space-aware gestures. + // See getNormalizedCompoundDistanceAfterFirstWord for details. + void saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet() { + // We get called here after each word. We only want to store the distance after + // the first word, so if we already have a distance we skip saving -- hence "IfNoneYet" + // in the method name. + if (mNormalizedCompoundDistanceAfterFirstWord >= MAX_VALUE_FOR_WEIGHTING) { + mNormalizedCompoundDistanceAfterFirstWord = getNormalizedCompoundDistance(); + } + } + + void addRawLength(const float rawLength) { + mRawLength += rawLength; + } + + float getCompoundDistance() const { + return getCompoundDistance(1.0f); + } + + float getCompoundDistance( + const float weightOfLangModelVsSpatialModel) const { + return mSpatialDistance + + mLanguageDistance * weightOfLangModelVsSpatialModel; + } + + float getNormalizedCompoundDistance() const { + return mNormalizedCompoundDistance; + } + + // For space-aware gestures, we store the normalized distance at the char index + // that ends the first word of the suggestion. We call this the distance after + // first word. 
+ float getNormalizedCompoundDistanceAfterFirstWord() const { + return mNormalizedCompoundDistanceAfterFirstWord; + } + + float getSpatialDistance() const { + return mSpatialDistance; + } + + float getLanguageDistance() const { + return mLanguageDistance; + } + + int16_t getEditCorrectionCount() const { + return mEditCorrectionCount; + } + + int16_t getProximityCorrectionCount() const { + return mProximityCorrectionCount; + } + + int16_t getCompletionCount() const { + return mCompletionCount; + } + + float getRawLength() const { + return mRawLength; + } + + DoubleLetterLevel getDoubleLetterLevel() const { + return mDoubleLetterLevel; + } + + void setDoubleLetterLevel(DoubleLetterLevel doubleLetterLevel) { + switch(doubleLetterLevel) { + case NOT_A_DOUBLE_LETTER: + break; + case A_DOUBLE_LETTER: + if (mDoubleLetterLevel != A_STRONG_DOUBLE_LETTER) { + mDoubleLetterLevel = doubleLetterLevel; + } + break; + case A_STRONG_DOUBLE_LETTER: + mDoubleLetterLevel = doubleLetterLevel; + break; + } + } + + DigraphUtils::DigraphCodePointIndex getDigraphIndex() const { + return mDigraphIndex; + } + + void advanceDigraphIndex() { + switch(mDigraphIndex) { + case DigraphUtils::NOT_A_DIGRAPH_INDEX: + mDigraphIndex = DigraphUtils::FIRST_DIGRAPH_CODEPOINT; + break; + case DigraphUtils::FIRST_DIGRAPH_CODEPOINT: + mDigraphIndex = DigraphUtils::SECOND_DIGRAPH_CODEPOINT; + break; + case DigraphUtils::SECOND_DIGRAPH_CODEPOINT: + mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX; + break; + } + } + + ErrorTypeUtils::ErrorType getContainedErrorTypes() const { + return mContainedErrorTypes; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DicNodeStateScoring); + + DoubleLetterLevel mDoubleLetterLevel; + DigraphUtils::DigraphCodePointIndex mDigraphIndex; + + int16_t mEditCorrectionCount; + int16_t mProximityCorrectionCount; + int16_t mCompletionCount; + + float mNormalizedCompoundDistance; + float mSpatialDistance; + float mLanguageDistance; + float mRawLength; + // All accumulated error types so 
far + ErrorTypeUtils::ErrorType mContainedErrorTypes; + float mNormalizedCompoundDistanceAfterFirstWord; + + AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance, + bool doNormalization, int inputSize, int totalInputIndex) { + mSpatialDistance += spatialDistance; + mLanguageDistance += languageDistance; + if (!doNormalization) { + mNormalizedCompoundDistance = mSpatialDistance + mLanguageDistance; + } else { + mNormalizedCompoundDistance = (mSpatialDistance + mLanguageDistance) + / static_cast(std::max(1, totalInputIndex)); + } + } +}; +} // namespace latinime +#endif // LATINIME_DIC_NODE_STATE_SCORING_H diff --git a/app/src/main/jni/src/suggest/core/dictionary/dictionary.cpp b/app/src/main/jni/src/suggest/core/dictionary/dictionary.cpp new file mode 100644 index 000000000..5c9a1392e --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dictionary/dictionary.cpp @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2009, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#define LOG_TAG "LatinIME: dictionary.cpp"
+
+#include "suggest/core/dictionary/dictionary.h"
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/property/ngram_context.h"
+#include "suggest/core/dictionary/dictionary_utils.h"
+#include "suggest/core/result/suggestion_results.h"
+#include "suggest/core/session/dic_traverse_session.h"
+#include "suggest/core/suggest.h"
+#include "suggest/core/suggest_options.h"
+#include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h"
+#include "suggest/policyimpl/typing/typing_suggest_policy_factory.h"
+#include "utils/int_array_view.h"
+#include "utils/log_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+// Size, in elements, of the buffers logDictionaryInfo() uses for header attributes.
+const int Dictionary::HEADER_ATTRIBUTE_BUFFER_SIZE = 32;
+
+// Takes ownership of the structure policy, creates one Suggest for gesture input and
+// one for typing input, then logs the dictionary header information.
+Dictionary::Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr
+        dictionaryStructureWithBufferPolicy)
+        : mDictionaryStructureWithBufferPolicy(std::move(dictionaryStructureWithBufferPolicy)),
+          mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())),
+          mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) {
+    logDictionaryInfo(env);
+}
+
+// Initializes the traverse session for this dictionary and forwards the input to the
+// gesture or the typing Suggest, chosen by suggestOptions->isGesture().
+void Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession,
+        int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints,
+        int inputSize, const NgramContext *const ngramContext,
+        const SuggestOptions *const suggestOptions, const float weightOfLangModelVsSpatialModel,
+        SuggestionResults *const outSuggestionResults) const {
+    TimeKeeper::setCurrentTime();
+    traverseSession->init(this, ngramContext, suggestOptions);
+    const auto &suggest = suggestOptions->isGesture() ? mGestureSuggest : mTypingSuggest;
+    suggest->getSuggestions(proximityInfo, traverseSession, xcoordinates,
+            ycoordinates, times, pointerIds, inputCodePoints, inputSize,
+            weightOfLangModelVsSpatialModel, outSuggestionResults);
+}
+
+// Stores the pointers and the previous-word id view; nothing is copied or owned.
+Dictionary::NgramListenerForPrediction::NgramListenerForPrediction(
+        const NgramContext *const ngramContext, const WordIdArrayView prevWordIds,
+        SuggestionResults *const suggestionResults,
+        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy)
+        : mNgramContext(ngramContext), mPrevWordIds(prevWordIds),
+          mSuggestionResults(suggestionResults), mDictStructurePolicy(dictStructurePolicy) {}
+
+// Called for each n-gram entry; adds the target word as a prediction unless one of
+// the guards below rejects it.
+void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability,
+        const int targetWordId) {
+    if (targetWordId == NOT_A_WORD_ID) {
+        return;
+    }
+    // After a beginning-of-sentence marker, entries without an n-gram probability are skipped.
+    if (mNgramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
+            && ngramProbability == NOT_A_PROBABILITY) {
+        return;
+    }
+    int targetWordCodePoints[MAX_WORD_LENGTH];
+    const int codePointCount = mDictStructurePolicy->getCodePointsAndReturnCodePointCount(
+            targetWordId, MAX_WORD_LENGTH, targetWordCodePoints);
+    if (codePointCount <= 0) {
+        return;
+    }
+    // The reported probability is the in-context one, not |ngramProbability|.
+    const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext(
+            mPrevWordIds, targetWordId, nullptr /* multiBigramMap */);
+    if (wordAttributes.getProbability() == NOT_A_PROBABILITY) {
+        return;
+    }
+    mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount,
+            wordAttributes.getProbability());
+}
+
+// Resolves the previous word ids from |ngramContext| and iterates the n-gram entries
+// that follow them, collecting predictions into |outSuggestionResults|.
+void Dictionary::getPredictions(const NgramContext *const ngramContext,
+        SuggestionResults *const outSuggestionResults) const {
+    TimeKeeper::setCurrentTime();
+    WordIdArray prevWordIdArray;
+    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(
+            mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray,
+            true /* tryLowerCaseSearch */);
+    NgramListenerForPrediction listener(ngramContext, prevWordIds, outSuggestionResults,
+            mDictionaryStructureWithBufferPolicy.get());
+    mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordIds, &listener);
+}
+
+// Probability of the word without any n-gram context.
+int Dictionary::getProbability(const CodePointArrayView codePoints) const {
+    return getNgramProbability(nullptr /* ngramContext */, codePoints);
+}
+
+int Dictionary::getMaxProbabilityOfExactMatches(const CodePointArrayView codePoints) const {
+    TimeKeeper::setCurrentTime();
+    return DictionaryUtils::getMaxProbabilityOfExactMatches(
+            mDictionaryStructureWithBufferPolicy.get(), codePoints);
+}
+
+// Returns NOT_A_PROBABILITY when the word is not found. With a null |ngramContext| the
+// lookup uses an empty previous-word view; otherwise the previous word ids are resolved
+// from the context first.
+int Dictionary::getNgramProbability(const NgramContext *const ngramContext,
+        const CodePointArrayView codePoints) const {
+    TimeKeeper::setCurrentTime();
+    const int wordId = mDictionaryStructureWithBufferPolicy->getWordId(codePoints,
+            false /* forceLowerCaseSearch */);
+    if (wordId == NOT_A_WORD_ID) return NOT_A_PROBABILITY;
+    if (!ngramContext) {
+        return getDictionaryStructurePolicy()->getProbabilityOfWord(WordIdArrayView(), wordId);
+    }
+    WordIdArray prevWordIdArray;
+    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(
+            mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray,
+            true /* tryLowerCaseSearch */);
+    return getDictionaryStructurePolicy()->getProbabilityOfWord(prevWordIds, wordId);
+}
+
+// Rejects (logs and returns false) a beginning-of-sentence entry when the dictionary
+// header says it is not supported; otherwise delegates to the structure policy.
+bool Dictionary::addUnigramEntry(const CodePointArrayView codePoints,
+        const UnigramProperty *const unigramProperty) {
+    if (unigramProperty->representsBeginningOfSentence()
+            && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy()
+                    ->supportsBeginningOfSentence()) {
+        AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
+        return false;
+    }
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->addUnigramEntry(codePoints, unigramProperty);
+}
+
+// The remaining members set the current time and forward to the structure policy.
+bool Dictionary::removeUnigramEntry(const CodePointArrayView codePoints) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints);
+}
+
+bool Dictionary::addNgramEntry(const NgramProperty *const ngramProperty) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->addNgramEntry(ngramProperty);
+}
+
+bool Dictionary::removeNgramEntry(const NgramContext *const ngramContext,
+        const CodePointArrayView codePoints) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->removeNgramEntry(ngramContext, codePoints);
+}
+
+bool Dictionary::updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+        const CodePointArrayView codePoints, const bool isValidWord,
+        const HistoricalInfo historicalInfo) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->updateEntriesForWordWithNgramContext(ngramContext,
+            codePoints, isValidWord, historicalInfo);
+}
+
+bool Dictionary::flush(const char *const filePath) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->flush(filePath);
+}
+
+bool Dictionary::flushWithGC(const char *const filePath) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->flushWithGC(filePath);
+}
+
+bool Dictionary::needsToRunGC(const bool mindsBlockByGC) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->needsToRunGC(mindsBlockByGC);
+}
+
+void Dictionary::getProperty(const char *const query, const int queryLength, char *const outResult,
+        const int maxResultLength) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->getProperty(query, queryLength, outResult,
+            maxResultLength);
+}
+
+const WordProperty Dictionary::getWordProperty(const CodePointArrayView codePoints) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->getWordProperty(codePoints);
+}
+
+int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints,
+        int *const outCodePointCount) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy->getNextWordAndNextToken(
+            token, outCodePoints, outCodePointCount);
+}
+
+// Reads the "dictionary", "version" and "date" header attributes, converts each from
+// code points to a char buffer and logs them on one line through LogUtils::logToJava.
+void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
+    int dictionaryIdCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    int versionStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    int dateStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    const DictionaryHeaderStructurePolicy *const headerPolicy =
+            getDictionaryStructurePolicy()->getHeaderStructurePolicy();
+    headerPolicy->readHeaderValueOrQuestionMark("dictionary", dictionaryIdCodePointBuffer,
+            HEADER_ATTRIBUTE_BUFFER_SIZE);
+    headerPolicy->readHeaderValueOrQuestionMark("version", versionStringCodePointBuffer,
+            HEADER_ATTRIBUTE_BUFFER_SIZE);
+    headerPolicy->readHeaderValueOrQuestionMark("date", dateStringCodePointBuffer,
+            HEADER_ATTRIBUTE_BUFFER_SIZE);
+
+    char dictionaryIdCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    char versionStringCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    char dateStringCharBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
+    intArrayToCharArray(dictionaryIdCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+            dictionaryIdCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+    intArrayToCharArray(versionStringCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+            versionStringCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+    intArrayToCharArray(dateStringCodePointBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE,
+            dateStringCharBuffer, HEADER_ATTRIBUTE_BUFFER_SIZE);
+
+    LogUtils::logToJava(env,
+            "Dictionary info: dictionary = %s ; version = %s ; date = %s",
+            dictionaryIdCharBuffer, versionStringCharBuffer, dateStringCharBuffer);
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/dictionary/dictionary.h b/app/src/main/jni/src/suggest/core/dictionary/dictionary.h
new file mode 100644
index 000000000..9e224ebfb
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dictionary/dictionary.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_H +#define LATINIME_DICTIONARY_H + +#include + +#include "defines.h" +#include "jni.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/word_property.h" +#include "suggest/core/suggest_interface.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicy; +class DicTraverseSession; +class NgramContext; +class ProximityInfo; +class SuggestionResults; +class SuggestOptions; + +class Dictionary { + public: + // Taken from SuggestedWords.java + static const int KIND_MASK_KIND = 0xFF; // Mask to get only the kind + static const int KIND_TYPED = 0; // What user typed + static const int KIND_CORRECTION = 1; // Simple correction/suggestion + static const int KIND_COMPLETION = 2; // Completion (suggestion with appended chars) + static const int KIND_WHITELIST = 3; // Whitelisted word + static const int KIND_BLACKLIST = 4; // Blacklisted word + static const int KIND_HARDCODED = 5; // Hardcoded suggestion, e.g. 
punctuation + static const int KIND_APP_DEFINED = 6; // Suggested by the application + static const int KIND_SHORTCUT = 7; // A shortcut + static const int KIND_PREDICTION = 8; // A prediction (== a suggestion with no input) + // KIND_RESUMED: A resumed suggestion (comes from a span, currently this type is used only + // in java for re-correction) + static const int KIND_RESUMED = 9; + static const int KIND_OOV_CORRECTION = 10; // Most probable string correction + + static const int KIND_MASK_FLAGS = 0xFFFFFF00; // Mask to get the flags + static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; + static const int KIND_FLAG_EXACT_MATCH = 0x40000000; + static const int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000; + static const int KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION = 0x10000000; + + Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr + dictionaryStructureWithBufferPolicy); + + void getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, + int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, + int inputSize, const NgramContext *const ngramContext, + const SuggestOptions *const suggestOptions, const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const; + + void getPredictions(const NgramContext *const ngramContext, + SuggestionResults *const outSuggestionResults) const; + + int getProbability(const CodePointArrayView codePoints) const; + + int getMaxProbabilityOfExactMatches(const CodePointArrayView codePoints) const; + + int getNgramProbability(const NgramContext *const ngramContext, + const CodePointArrayView codePoints) const; + + bool addUnigramEntry(const CodePointArrayView codePoints, + const UnigramProperty *const unigramProperty); + + bool removeUnigramEntry(const CodePointArrayView codePoints); + + bool addNgramEntry(const NgramProperty *const ngramProperty); + + bool removeNgramEntry(const NgramContext 
*const ngramContext, + const CodePointArrayView codePoints); + + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView codePoints, const bool isValidWord, + const HistoricalInfo historicalInfo); + + bool flush(const char *const filePath); + + bool flushWithGC(const char *const filePath); + + bool needsToRunGC(const bool mindsBlockByGC); + + void getProperty(const char *const query, const int queryLength, char *const outResult, + const int maxResultLength); + + const WordProperty getWordProperty(const CodePointArrayView codePoints); + + // Method to iterate all words in the dictionary. + // The returned token has to be used to get the next word. If token is 0, this method newly + // starts iterating the dictionary. + int getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount); + + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { + return mDictionaryStructureWithBufferPolicy.get(); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary); + + typedef std::unique_ptr SuggestInterfacePtr; + + class NgramListenerForPrediction : public NgramListener { + public: + NgramListenerForPrediction(const NgramContext *const ngramContext, + const WordIdArrayView prevWordIds, SuggestionResults *const suggestionResults, + const DictionaryStructureWithBufferPolicy *const dictStructurePolicy); + virtual void onVisitEntry(const int ngramProbability, const int targetWordId); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction); + + const NgramContext *const mNgramContext; + const WordIdArrayView mPrevWordIds; + SuggestionResults *const mSuggestionResults; + const DictionaryStructureWithBufferPolicy *const mDictStructurePolicy; + }; + + static const int HEADER_ATTRIBUTE_BUFFER_SIZE; + + const DictionaryStructureWithBufferPolicy::StructurePolicyPtr + mDictionaryStructureWithBufferPolicy; + const SuggestInterfacePtr mGestureSuggest; + 
const SuggestInterfacePtr mTypingSuggest; + + void logDictionaryInfo(JNIEnv *const env) const; +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_H diff --git a/app/src/main/jni/src/suggest/core/dictionary/dictionary_utils.cpp b/app/src/main/jni/src/suggest/core/dictionary/dictionary_utils.cpp new file mode 100644 index 000000000..7de550026 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dictionary/dictionary_utils.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/dictionary_utils.h" + +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/ngram_context.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/digraph_utils.h" +#include "utils/int_array_view.h" + +namespace latinime { + +/* static */ int DictionaryUtils::getMaxProbabilityOfExactMatches( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const CodePointArrayView codePoints) { + std::vector current; + std::vector next; + + // No ngram context. 
+ NgramContext emptyNgramContext; + WordIdArray prevWordIdArray; + const WordIdArrayView prevWordIds = emptyNgramContext.getPrevWordIds( + dictionaryStructurePolicy, &prevWordIdArray, false /* tryLowerCaseSearch */); + current.emplace_back(); + DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordIds, ¤t.front()); + for (const int codePoint : codePoints) { + // The base-lower input is used to ignore case errors and accent errors. + const int baseLowerCodePoint = CharUtils::toBaseLowerCase(codePoint); + for (const DicNode &dicNode : current) { + if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == baseLowerCodePoint) { + next.emplace_back(dicNode); + next.back().advanceDigraphIndex(); + continue; + } + processChildDicNodes(dictionaryStructurePolicy, baseLowerCodePoint, &dicNode, &next); + } + current.clear(); + current.swap(next); + } + + int maxProbability = NOT_A_PROBABILITY; + for (const DicNode &dicNode : current) { + if (!dicNode.isTerminalDicNode()) { + continue; + } + const WordAttributes wordAttributes = + dictionaryStructurePolicy->getWordAttributesInContext(dicNode.getPrevWordIds(), + dicNode.getWordId(), nullptr /* multiBigramMap */); + // dicNode can contain case errors, accent errors, intentional omissions or digraphs. 
+ maxProbability = std::max(maxProbability, wordAttributes.getProbability()); + } + return maxProbability; +} + +/* static */ void DictionaryUtils::processChildDicNodes( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int inputCodePoint, const DicNode *const parentDicNode, + std::vector *const outDicNodes) { + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes(parentDicNode, dictionaryStructurePolicy, &childDicNodes); + for (int childIndex = 0; childIndex < childDicNodes.getSizeAndLock(); ++childIndex) { + DicNode *const childDicNode = childDicNodes[childIndex]; + const int codePoint = CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); + if (inputCodePoint == codePoint) { + outDicNodes->emplace_back(*childDicNode); + } + if (childDicNode->canBeIntentionalOmission()) { + processChildDicNodes(dictionaryStructurePolicy, inputCodePoint, childDicNode, + outDicNodes); + } + if (DigraphUtils::hasDigraphForCodePoint( + dictionaryStructurePolicy->getHeaderStructurePolicy(), + childDicNode->getNodeCodePoint())) { + childDicNode->advanceDigraphIndex(); + if (childDicNode->getNodeCodePoint() == codePoint) { + childDicNode->advanceDigraphIndex(); + outDicNodes->emplace_back(*childDicNode); + } + } + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/dictionary/dictionary_utils.h b/app/src/main/jni/src/suggest/core/dictionary/dictionary_utils.h new file mode 100644 index 000000000..4dd21c9be --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dictionary/dictionary_utils.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_UTILS_H +#define LATINIME_DICTIONARY_UTILS_H + +#include + +#include "defines.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicy; +class DicNode; + +class DictionaryUtils { + public: + static int getMaxProbabilityOfExactMatches( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const CodePointArrayView codePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryUtils); + + static void processChildDicNodes( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const int inputCodePoint, const DicNode *const parentDicNode, + std::vector *const outDicNodes); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_UTILS_H diff --git a/app/src/main/jni/src/suggest/core/dictionary/digraph_utils.cpp b/app/src/main/jni/src/suggest/core/dictionary/digraph_utils.cpp new file mode 100644 index 000000000..4d68f620f --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dictionary/digraph_utils.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/digraph_utils.h" + +#include + +#include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "utils/char_utils.h" + +namespace latinime { + +const DigraphUtils::digraph_t DigraphUtils::GERMAN_UMLAUT_DIGRAPHS[] = + { { 'a', 'e', 0x00E4 }, // U+00E4 : LATIN SMALL LETTER A WITH DIAERESIS + { 'o', 'e', 0x00F6 }, // U+00F6 : LATIN SMALL LETTER O WITH DIAERESIS + { 'u', 'e', 0x00FC } }; // U+00FC : LATIN SMALL LETTER U WITH DIAERESIS +const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = + { DIGRAPH_TYPE_GERMAN_UMLAUT }; + +/* static */ bool DigraphUtils::hasDigraphForCodePoint( + const DictionaryHeaderStructurePolicy *const headerPolicy, + const int compositeGlyphCodePoint) { + const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(headerPolicy); + if (DigraphUtils::getDigraphForDigraphTypeAndCodePoint(digraphType, compositeGlyphCodePoint)) { + return true; + } + return false; +} + +// Returns the digraph type associated with the given dictionary. +/* static */ DigraphUtils::DigraphType DigraphUtils::getDigraphTypeForDictionary( + const DictionaryHeaderStructurePolicy *const headerPolicy) { + if (headerPolicy->requiresGermanUmlautProcessing()) { + return DIGRAPH_TYPE_GERMAN_UMLAUT; + } + return DIGRAPH_TYPE_NONE; +} + +// Returns the digraph codepoint for the given composite glyph codepoint and digraph codepoint index +// (which specifies the first or second codepoint in the digraph). 
+/* static */ int DigraphUtils::getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
+        const DigraphCodePointIndex digraphCodePointIndex) {
+    if (digraphCodePointIndex == NOT_A_DIGRAPH_INDEX) {
+        return NOT_A_CODE_POINT;
+    }
+    const DigraphUtils::digraph_t *const digraph =
+            DigraphUtils::getDigraphForCodePoint(compositeGlyphCodePoint);
+    if (!digraph) {
+        return NOT_A_CODE_POINT;
+    }
+    if (digraphCodePointIndex == FIRST_DIGRAPH_CODEPOINT) {
+        return digraph->first;
+    } else if (digraphCodePointIndex == SECOND_DIGRAPH_CODEPOINT) {
+        return digraph->second;
+    }
+    // Every DigraphCodePointIndex value is handled above, so this is not expected to run.
+    ASSERT(false);
+    return NOT_A_CODE_POINT;
+}
+
+// Retrieves the set of all digraphs associated with the given digraph type.
+// Returns the size of the digraph array, or 0 if none exist.
+// |*digraphs| is written only when the type has a table; otherwise it is left untouched.
+/* static */ int DigraphUtils::getAllDigraphsForDigraphTypeAndReturnSize(
+        const DigraphUtils::DigraphType digraphType,
+        const DigraphUtils::digraph_t **const digraphs) {
+    if (digraphType == DigraphUtils::DIGRAPH_TYPE_GERMAN_UMLAUT) {
+        *digraphs = GERMAN_UMLAUT_DIGRAPHS;
+        return NELEMS(GERMAN_UMLAUT_DIGRAPHS);
+    }
+    return 0;
+}
+
+/**
+ * Returns the digraph for the input composite glyph codepoint, or nullptr if none exists.
+ * compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
+ * Every type listed in USED_DIGRAPH_TYPES is searched in order; the first hit is returned.
+ */
+/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForCodePoint(
+        const int compositeGlyphCodePoint) {
+    for (size_t i = 0; i < NELEMS(USED_DIGRAPH_TYPES); i++) {
+        const DigraphUtils::digraph_t *const digraph = getDigraphForDigraphTypeAndCodePoint(
+                USED_DIGRAPH_TYPES[i], compositeGlyphCodePoint);
+        if (digraph) {
+            return digraph;
+        }
+    }
+    return nullptr;
+}
+
+/**
+ * Returns the digraph for the input composite glyph codepoint, or nullptr if none exists.
+ * digraphType: the type of digraphs supported.
+ * compositeGlyphCodePoint: the method returns the digraph corresponding to this codepoint.
+ * The code point is lower-cased before it is compared with the table entries.
+ */
+/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint(
+        const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) {
+    const DigraphUtils::digraph_t *digraphs = nullptr;
+    const int compositeGlyphLowerCodePoint = CharUtils::toLowerCase(compositeGlyphCodePoint);
+    const int digraphsSize =
+            DigraphUtils::getAllDigraphsForDigraphTypeAndReturnSize(digraphType, &digraphs);
+    for (int i = 0; i < digraphsSize; i++) {
+        if (digraphs[i].compositeGlyph == compositeGlyphLowerCodePoint) {
+            return &digraphs[i];
+        }
+    }
+    return nullptr;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/dictionary/digraph_utils.h b/app/src/main/jni/src/suggest/core/dictionary/digraph_utils.h
new file mode 100644
index 000000000..bec2cd6e2
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dictionary/digraph_utils.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DIGRAPH_UTILS_H
+#define DIGRAPH_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+class DictionaryHeaderStructurePolicy;
+
+// Static-only helper that maps composite glyph code points to the two code points
+// of their digraph; it cannot be instantiated.
+class DigraphUtils {
+ public:
+    // Position within a digraph: not in one, on its first, or on its second code point.
+    typedef enum {
+        NOT_A_DIGRAPH_INDEX,
+        FIRST_DIGRAPH_CODEPOINT,
+        SECOND_DIGRAPH_CODEPOINT
+    } DigraphCodePointIndex;
+
+    // Family of digraphs a dictionary uses.
+    typedef enum {
+        DIGRAPH_TYPE_NONE,
+        DIGRAPH_TYPE_GERMAN_UMLAUT,
+    } DigraphType;
+
+    // One digraph: its first and second code points and the composite glyph they stand for.
+    typedef struct { int first; int second; int compositeGlyph; } digraph_t;
+
+    static bool hasDigraphForCodePoint(const DictionaryHeaderStructurePolicy *const headerPolicy,
+            const int compositeGlyphCodePoint);
+    static int getDigraphCodePointForIndex(const int compositeGlyphCodePoint,
+            const DigraphCodePointIndex digraphCodePointIndex);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils);
+    static DigraphType getDigraphTypeForDictionary(
+            const DictionaryHeaderStructurePolicy *const headerPolicy);
+    static int getAllDigraphsForDigraphTypeAndReturnSize(
+            const DigraphType digraphType, const digraph_t **const digraphs);
+    static const digraph_t *getDigraphForCodePoint(const int compositeGlyphCodePoint);
+    static const digraph_t *getDigraphForDigraphTypeAndCodePoint(
+            const DigraphType digraphType, const int compositeGlyphCodePoint);
+
+    static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
+    static const DigraphType USED_DIGRAPH_TYPES[];
+};
+} // namespace latinime
+#endif // DIGRAPH_UTILS_H
diff --git a/app/src/main/jni/src/suggest/core/dictionary/error_type_utils.cpp b/app/src/main/jni/src/suggest/core/dictionary/error_type_utils.cpp
new file mode 100644
index 000000000..61093e174
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/dictionary/error_type_utils.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/error_type_utils.h" + +namespace latinime { + +const ErrorTypeUtils::ErrorType ErrorTypeUtils::NOT_AN_ERROR = 0x0; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_CASE = 0x1; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT = 0x2; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT = 0x4; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT = 0x8; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x10; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x20; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x40; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x80; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x100; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x200; + +const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = + NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR; + +const ErrorTypeUtils::ErrorType + ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = + ERRORS_TREATED_AS_AN_EXACT_MATCH | INTENTIONAL_OMISSION; + +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/dictionary/error_type_utils.h b/app/src/main/jni/src/suggest/core/dictionary/error_type_utils.h new file mode 100644 index 
000000000..75111ba75 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/dictionary/error_type_utils.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ERROR_TYPE_UTILS_H +#define LATINIME_ERROR_TYPE_UTILS_H + +#include + +#include "defines.h" + +namespace latinime { + +class ErrorTypeUtils { + public: + // ErrorType is mainly decided by CorrectionType but it is also depending on if + // the correction has really been performed or not. + typedef uint32_t ErrorType; + + static const ErrorType NOT_AN_ERROR; + static const ErrorType MATCH_WITH_WRONG_CASE; + static const ErrorType MATCH_WITH_MISSING_ACCENT; + static const ErrorType MATCH_WITH_MISSING_EXPLICIT_ACCENT; + static const ErrorType MATCH_WITH_WRONG_ACCENT; + static const ErrorType MATCH_WITH_DIGRAPH; + // Treat error as an intentional omission when the CorrectionType is omission and the node can + // be intentional omission. + static const ErrorType INTENTIONAL_OMISSION; + // Substitution, omission and transposition + static const ErrorType EDIT_CORRECTION; + // Proximity error + static const ErrorType PROXIMITY_CORRECTION; + // Completion + static const ErrorType COMPLETION; + // New word + // TODO: Remove. + // A new word error should be an edit correction error or a proximity correction error. 
+ static const ErrorType NEW_WORD; + + static bool isExactMatch(const ErrorType containedErrorTypes) { + return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; + } + + static bool isPerfectMatch(const ErrorType containedErrorTypes) { + return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0; + } + + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { + return (containedErrorTypes + & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; + } + + static bool isMissingExplicitAccent(const ErrorType errorType) { + return (errorType & MATCH_WITH_MISSING_EXPLICIT_ACCENT) != 0; + } + + static bool isEditCorrectionError(const ErrorType errorType) { + return (errorType & EDIT_CORRECTION) != 0; + } + + static bool isProximityCorrectionError(const ErrorType errorType) { + return (errorType & PROXIMITY_CORRECTION) != 0; + } + + static bool isCompletion(const ErrorType errorType) { + return (errorType & COMPLETION) != 0; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); + + static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH; + static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; +}; +} // namespace latinime +#endif // LATINIME_ERROR_TYPE_UTILS_H diff --git a/app/src/main/jni/src/suggest/core/layout/additional_proximity_chars.cpp b/app/src/main/jni/src/suggest/core/layout/additional_proximity_chars.cpp new file mode 100644 index 000000000..8b39f7da5 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/additional_proximity_chars.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/layout/additional_proximity_chars.h" + +namespace latinime { +// TODO: Stop using hardcoded additional proximity characters. +// TODO: Have proximity character informations in each language's binary dictionary. +const int AdditionalProximityChars::LOCALE_EN_US[LOCALE_EN_US_SIZE] = { 'e', 'n' }; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = { + 'e', 'i', 'o', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_E[EN_US_ADDITIONAL_E_SIZE] = { + 'a', 'i', 'o', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_I[EN_US_ADDITIONAL_I_SIZE] = { + 'a', 'e', 'o', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_O[EN_US_ADDITIONAL_O_SIZE] = { + 'a', 'e', 'i', 'u' +}; + +const int AdditionalProximityChars::EN_US_ADDITIONAL_U[EN_US_ADDITIONAL_U_SIZE] = { + 'a', 'e', 'i', 'o' +}; +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/layout/additional_proximity_chars.h b/app/src/main/jni/src/suggest/core/layout/additional_proximity_chars.h new file mode 100644 index 000000000..2260be9bd --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/additional_proximity_chars.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ADDITIONAL_PROXIMITY_CHARS_H +#define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H + +#include +#include + +#include "defines.h" + +namespace latinime { + +class AdditionalProximityChars { + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(AdditionalProximityChars); + static const int LOCALE_EN_US_SIZE = 2; + static const int LOCALE_EN_US[LOCALE_EN_US_SIZE]; + static const int EN_US_ADDITIONAL_A_SIZE = 4; + static const int EN_US_ADDITIONAL_A[]; + static const int EN_US_ADDITIONAL_E_SIZE = 4; + static const int EN_US_ADDITIONAL_E[]; + static const int EN_US_ADDITIONAL_I_SIZE = 4; + static const int EN_US_ADDITIONAL_I[]; + static const int EN_US_ADDITIONAL_O_SIZE = 4; + static const int EN_US_ADDITIONAL_O[]; + static const int EN_US_ADDITIONAL_U_SIZE = 4; + static const int EN_US_ADDITIONAL_U[]; + + AK_FORCE_INLINE static bool isEnLocale(const std::vector *locale) { + const int NCHARS = NELEMS(LOCALE_EN_US); + if (locale->size() < NCHARS) { + return false; + } + for (int i = 0; i < NCHARS; ++i) { + if ((*locale)[i] != LOCALE_EN_US[i]) { + return false; + } + } + return true; + } + + public: + static int getAdditionalCharsSize(const std::vector *locale, const int c) { + if (!isEnLocale(locale)) { + return 0; + } + switch (c) { + case 'a': + return EN_US_ADDITIONAL_A_SIZE; + case 'e': + return EN_US_ADDITIONAL_E_SIZE; + case 'i': + return EN_US_ADDITIONAL_I_SIZE; + case 'o': + return EN_US_ADDITIONAL_O_SIZE; + case 'u': + return EN_US_ADDITIONAL_U_SIZE; + default: + return 0; + } + } + + static const int *getAdditionalChars(const 
std::vector *locale, const int c) { + if (!isEnLocale(locale)) { + return 0; + } + switch (c) { + case 'a': + return EN_US_ADDITIONAL_A; + case 'e': + return EN_US_ADDITIONAL_E; + case 'i': + return EN_US_ADDITIONAL_I; + case 'o': + return EN_US_ADDITIONAL_O; + case 'u': + return EN_US_ADDITIONAL_U; + default: + return 0; + } + } +}; +} // namespace latinime +#endif // LATINIME_ADDITIONAL_PROXIMITY_CHARS_H diff --git a/app/src/main/jni/src/suggest/core/layout/geometry_utils.h b/app/src/main/jni/src/suggest/core/layout/geometry_utils.h new file mode 100644 index 000000000..000fcd4a1 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/geometry_utils.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_GEOMETRY_UTILS_H +#define LATINIME_GEOMETRY_UTILS_H + +#include + +#include "defines.h" + +#define ROUND_FLOAT_10000(f) ((f) < 1000.0f && (f) > 0.001f) \ + ? 
(floorf((f) * 10000.0f) / 10000.0f) : (f) + +namespace latinime { + +class GeometryUtils { + public: + static inline float SQUARE_FLOAT(const float x) { return x * x; } + + static AK_FORCE_INLINE float getAngle(const int x1, const int y1, const int x2, const int y2) { + const int dx = x1 - x2; + const int dy = y1 - y2; + if (dx == 0 && dy == 0) return 0.0f; + return atan2f(static_cast(dy), static_cast(dx)); + } + + static AK_FORCE_INLINE float getAngleDiff(const float a1, const float a2) { + static const float M_2PI_F = M_PI * 2.0f; + float delta = fabsf(a1 - a2); + if (delta > M_2PI_F) { + delta -= (M_2PI_F * static_cast(delta / M_2PI_F)); + } + if (delta > M_PI_F) { + delta = M_2PI_F - delta; + } + return ROUND_FLOAT_10000(delta); + } + + static AK_FORCE_INLINE int getDistanceInt(const int x1, const int y1, const int x2, + const int y2) { + return static_cast(hypotf(static_cast(x1 - x2), static_cast(y1 - y2))); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(GeometryUtils); +}; +} // namespace latinime +#endif // LATINIME_GEOMETRY_UTILS_H diff --git a/app/src/main/jni/src/suggest/core/layout/normal_distribution.h b/app/src/main/jni/src/suggest/core/layout/normal_distribution.h new file mode 100644 index 000000000..5f21a59c0 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/normal_distribution.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_NORMAL_DISTRIBUTION_H +#define LATINIME_NORMAL_DISTRIBUTION_H + +#include + +#include "defines.h" + +namespace latinime { + +// Normal distribution N(u, sigma^2). +class NormalDistribution { + public: + NormalDistribution(const float u, const float sigma) + : mU(u), + mPreComputedNonExpPart(1.0f / sqrtf(2.0f * M_PI_F + * GeometryUtils::SQUARE_FLOAT(sigma))), + mPreComputedExponentPart(-1.0f / (2.0f * GeometryUtils::SQUARE_FLOAT(sigma))) {} + + float getProbabilityDensity(const float x) const { + const float shiftedX = x - mU; + return mPreComputedNonExpPart + * expf(mPreComputedExponentPart * GeometryUtils::SQUARE_FLOAT(shiftedX)); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(NormalDistribution); + + const float mU; // mean value + const float mPreComputedNonExpPart; // = 1 / sqrt(2 * PI * sigma^2) + const float mPreComputedExponentPart; // = -1 / (2 * sigma^2) +}; +} // namespace latinime +#endif // LATINIME_NORMAL_DISTRIBUTION_H diff --git a/app/src/main/jni/src/suggest/core/layout/normal_distribution_2d.h b/app/src/main/jni/src/suggest/core/layout/normal_distribution_2d.h new file mode 100644 index 000000000..3bc0a0153 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/normal_distribution_2d.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_NORMAL_DISTRIBUTION_2D_H +#define LATINIME_NORMAL_DISTRIBUTION_2D_H + +#include + +#include "defines.h" +#include "suggest/core/layout/geometry_utils.h" +#include "suggest/core/layout/normal_distribution.h" + +namespace latinime { + +// Normal distribution on a 2D plane. The covariance is always zero, but the distribution can be +// rotated. +class NormalDistribution2D { + public: + NormalDistribution2D(const float uX, const float sigmaX, const float uY, const float sigmaY, + const float theta) + : mXDistribution(0.0f, sigmaX), mYDistribution(0.0f, sigmaY), mUX(uX), mUY(uY), + mSinTheta(sinf(theta)), mCosTheta(cosf(theta)) {} + + float getProbabilityDensity(const float x, const float y) const { + // Shift + const float shiftedX = x - mUX; + const float shiftedY = y - mUY; + // Rotate + const float rotatedShiftedX = mCosTheta * shiftedX + mSinTheta * shiftedY; + const float rotatedShiftedY = -mSinTheta * shiftedX + mCosTheta * shiftedY; + return mXDistribution.getProbabilityDensity(rotatedShiftedX) + * mYDistribution.getProbabilityDensity(rotatedShiftedY); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(NormalDistribution2D); + + const NormalDistribution mXDistribution; + const NormalDistribution mYDistribution; + const float mUX; + const float mUY; + const float mSinTheta; + const float mCosTheta; +}; +} // namespace latinime +#endif // LATINIME_NORMAL_DISTRIBUTION_2D_H diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info.cpp b/app/src/main/jni/src/suggest/core/layout/proximity_info.cpp new file mode 100644 index 000000000..933a5e145 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/proximity_info.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: proximity_info.cpp" + +#include "suggest/core/layout/proximity_info.h" + +#include +#include +#include + +#include "defines.h" +#include "jni.h" +#include "suggest/core/layout/additional_proximity_chars.h" +#include "suggest/core/layout/geometry_utils.h" +#include "suggest/core/layout/proximity_info_params.h" +#include "utils/char_utils.h" + +namespace latinime { + +static AK_FORCE_INLINE void safeGetOrFillZeroIntArrayRegion(JNIEnv *env, jintArray jArray, + jsize len, jint *buffer) { + if (jArray && buffer) { + env->GetIntArrayRegion(jArray, 0, len, buffer); + } else if (buffer) { + memset(buffer, 0, len * sizeof(buffer[0])); + } +} + +static AK_FORCE_INLINE void safeGetOrFillZeroFloatArrayRegion(JNIEnv *env, jfloatArray jArray, + jsize len, jfloat *buffer) { + if (jArray && buffer) { + env->GetFloatArrayRegion(jArray, 0, len, buffer); + } else if (buffer) { + memset(buffer, 0, len * sizeof(buffer[0])); + } +} + +ProximityInfo::ProximityInfo(JNIEnv *env, const int keyboardWidth, const int keyboardHeight, + const int gridWidth, const int gridHeight, const int mostCommonKeyWidth, + const int mostCommonKeyHeight, const jintArray proximityChars, const int keyCount, + const jintArray keyXCoordinates, const jintArray keyYCoordinates, + const jintArray keyWidths, const jintArray keyHeights, const jintArray keyCharCodes, + const jfloatArray sweetSpotCenterXs, const jfloatArray sweetSpotCenterYs, + const jfloatArray sweetSpotRadii) + : GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight), 
MOST_COMMON_KEY_WIDTH(mostCommonKeyWidth), + MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth), + NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE(1.0f + + GeometryUtils::SQUARE_FLOAT(static_cast(mostCommonKeyHeight) / + static_cast(mostCommonKeyWidth))), + CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth), + CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight), + KEY_COUNT(std::min(keyCount, MAX_KEY_COUNT_IN_A_KEYBOARD)), + KEYBOARD_WIDTH(keyboardWidth), KEYBOARD_HEIGHT(keyboardHeight), + KEYBOARD_HYPOTENUSE(hypotf(KEYBOARD_WIDTH, KEYBOARD_HEIGHT)), + HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates + && keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs + && sweetSpotCenterYs && sweetSpotRadii), + mProximityCharsArray(new int[GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE + /* proximityCharsLength */]), + mLowerCodePointToKeyMap() { + /* Let's check the input array length here to make sure */ + const jsize proximityCharsLength = env->GetArrayLength(proximityChars); + if (proximityCharsLength != GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE) { + AKLOGE("Invalid proximityCharsLength: %d", proximityCharsLength); + ASSERT(false); + return; + } + if (DEBUG_PROXIMITY_INFO) { + AKLOGI("Create proximity info array %d", proximityCharsLength); + } + safeGetOrFillZeroIntArrayRegion(env, proximityChars, proximityCharsLength, + mProximityCharsArray); + safeGetOrFillZeroIntArrayRegion(env, keyXCoordinates, KEY_COUNT, mKeyXCoordinates); + safeGetOrFillZeroIntArrayRegion(env, keyYCoordinates, KEY_COUNT, mKeyYCoordinates); + safeGetOrFillZeroIntArrayRegion(env, keyWidths, KEY_COUNT, mKeyWidths); + safeGetOrFillZeroIntArrayRegion(env, keyHeights, KEY_COUNT, mKeyHeights); + safeGetOrFillZeroIntArrayRegion(env, keyCharCodes, KEY_COUNT, mKeyCodePoints); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterXs, KEY_COUNT, mSweetSpotCenterXs); + safeGetOrFillZeroFloatArrayRegion(env, 
sweetSpotCenterYs, KEY_COUNT, mSweetSpotCenterYs); + safeGetOrFillZeroFloatArrayRegion(env, sweetSpotRadii, KEY_COUNT, mSweetSpotRadii); + initializeG(); +} + +ProximityInfo::~ProximityInfo() { + delete[] mProximityCharsArray; +} + +bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { + if (x < 0 || y < 0) { + if (DEBUG_DICT) { + AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); + // TODO: Enable this assertion. + //ASSERT(false); + } + return false; + } + + const int startIndex = ProximityInfoUtils::getStartIndexFromCoordinates(x, y, + CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH); + if (DEBUG_PROXIMITY_INFO) { + AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); + } + int *proximityCharsArray = mProximityCharsArray; + for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + if (DEBUG_PROXIMITY_INFO) { + AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); + } + if (proximityCharsArray[startIndex + i] == KEYCODE_SPACE) { + return true; + } + } + return false; +} + +float ProximityInfo::getNormalizedSquaredDistanceFromCenterFloatG( + const int keyId, const int x, const int y, const bool isGeometric) const { + const float centerX = static_cast(getKeyCenterXOfKeyIdG(keyId, x, isGeometric)); + const float centerY = static_cast(getKeyCenterYOfKeyIdG(keyId, y, isGeometric)); + const float touchX = static_cast(x); + const float touchY = static_cast(y); + return ProximityInfoUtils::getSquaredDistanceFloat(centerX, centerY, touchX, touchY) + / GeometryUtils::SQUARE_FLOAT(static_cast(getMostCommonKeyWidth())); +} + +int ProximityInfo::getCodePointOf(const int keyIndex) const { + if (keyIndex < 0 || keyIndex >= KEY_COUNT) { + return NOT_A_CODE_POINT; + } + return mKeyIndexToLowerCodePointG[keyIndex]; +} + +int ProximityInfo::getOriginalCodePointOf(const int keyIndex) const { + if (keyIndex < 0 || keyIndex >= KEY_COUNT) { + return NOT_A_CODE_POINT; + } + return mKeyIndexToOriginalCodePoint[keyIndex]; +} + +void 
ProximityInfo::initializeG() { + // TODO: Optimize + for (int i = 0; i < KEY_COUNT; ++i) { + const int code = mKeyCodePoints[i]; + const int lowerCode = CharUtils::toLowerCase(code); + mCenterXsG[i] = mKeyXCoordinates[i] + mKeyWidths[i] / 2; + mCenterYsG[i] = mKeyYCoordinates[i] + mKeyHeights[i] / 2; + if (hasTouchPositionCorrectionData()) { + // Computes sweet spot center points for geometric input. + const float verticalScale = ProximityInfoParams::VERTICAL_SWEET_SPOT_SCALE_G; + const float sweetSpotCenterY = static_cast(mSweetSpotCenterYs[i]); + const float gapY = sweetSpotCenterY - mCenterYsG[i]; + mSweetSpotCenterYsG[i] = static_cast(mCenterYsG[i] + gapY * verticalScale); + } + mLowerCodePointToKeyMap[lowerCode] = i; + mKeyIndexToOriginalCodePoint[i] = code; + mKeyIndexToLowerCodePointG[i] = lowerCode; + } + for (int i = 0; i < KEY_COUNT; i++) { + mKeyKeyDistancesG[i][i] = 0; + for (int j = i + 1; j < KEY_COUNT; j++) { + if (hasTouchPositionCorrectionData()) { + // Computes distances using sweet spots if they exist. + // We have two types of Y coordinate sweet spots, for geometric and for the others. + // The sweet spots for geometric input are used for calculating key-key distances + // here. + mKeyKeyDistancesG[i][j] = GeometryUtils::getDistanceInt( + mSweetSpotCenterXs[i], mSweetSpotCenterYsG[i], + mSweetSpotCenterXs[j], mSweetSpotCenterYsG[j]); + } else { + mKeyKeyDistancesG[i][j] = GeometryUtils::getDistanceInt( + mCenterXsG[i], mCenterYsG[i], mCenterXsG[j], mCenterYsG[j]); + } + mKeyKeyDistancesG[j][i] = mKeyKeyDistancesG[i][j]; + } + } +} + +// referencePointX is used only for keys wider than most common key width. When the referencePointX +// is NOT_A_COORDINATE, this method calculates the return value without using the line segment. +// isGeometric is currently not used because we don't have extra X coordinates sweet spots for +// geometric input. 
+int ProximityInfo::getKeyCenterXOfKeyIdG( + const int keyId, const int referencePointX, const bool isGeometric) const { + if (keyId < 0) { + return 0; + } + int centerX = (hasTouchPositionCorrectionData()) ? static_cast(mSweetSpotCenterXs[keyId]) + : mCenterXsG[keyId]; + const int keyWidth = mKeyWidths[keyId]; + if (referencePointX != NOT_A_COORDINATE + && keyWidth > getMostCommonKeyWidth()) { + // For keys wider than most common keys, we use a line segment instead of the center point; + // thus, centerX is adjusted depending on referencePointX. + const int keyWidthHalfDiff = (keyWidth - getMostCommonKeyWidth()) / 2; + if (referencePointX < centerX - keyWidthHalfDiff) { + centerX -= keyWidthHalfDiff; + } else if (referencePointX > centerX + keyWidthHalfDiff) { + centerX += keyWidthHalfDiff; + } else { + centerX = referencePointX; + } + } + return centerX; +} + +// When the referencePointY is NOT_A_COORDINATE, this method calculates the return value without +// using the line segment. +int ProximityInfo::getKeyCenterYOfKeyIdG( + const int keyId, const int referencePointY, const bool isGeometric) const { + // TODO: Remove "isGeometric" and have separate "proximity_info"s for gesture and typing. + if (keyId < 0) { + return 0; + } + int centerY; + if (!hasTouchPositionCorrectionData()) { + centerY = mCenterYsG[keyId]; + } else if (isGeometric) { + centerY = static_cast(mSweetSpotCenterYsG[keyId]); + } else { + centerY = static_cast(mSweetSpotCenterYs[keyId]); + } + if (referencePointY != NOT_A_COORDINATE && + centerY + mKeyHeights[keyId] > KEYBOARD_HEIGHT && centerY < referencePointY) { + // When the distance between center point and bottom edge of the keyboard is shorter than + // the key height, we assume the key is located at the bottom row of the keyboard. + // The center point is extended to the bottom edge for such keys. 
+ return referencePointY; + } + return centerY; +} + +int ProximityInfo::getKeyKeyDistanceG(const int keyId0, const int keyId1) const { + if (keyId0 >= 0 && keyId1 >= 0) { + return mKeyKeyDistancesG[keyId0][keyId1]; + } + return MAX_VALUE_FOR_WEIGHTING; +} +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info.h b/app/src/main/jni/src/suggest/core/layout/proximity_info.h new file mode 100644 index 000000000..f7c907697 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/proximity_info.h @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_PROXIMITY_INFO_H +#define LATINIME_PROXIMITY_INFO_H + +#include +#include + +#include "defines.h" +#include "jni.h" +#include "suggest/core/layout/proximity_info_utils.h" + +namespace latinime { + +class ProximityInfo { + public: + ProximityInfo(JNIEnv *env, const int keyboardWidth, const int keyboardHeight, + const int gridWidth, const int gridHeight, + const int mostCommonKeyWidth, const int mostCommonKeyHeight, + const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates, + const jintArray keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights, + const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs, + const jfloatArray sweetSpotCenterYs, const jfloatArray sweetSpotRadii); + ~ProximityInfo(); + bool hasSpaceProximity(const int x, const int y) const; + float getNormalizedSquaredDistanceFromCenterFloatG( + const int keyId, const int x, const int y, const bool isGeometric) const; + int getCodePointOf(const int keyIndex) const; + int getOriginalCodePointOf(const int keyIndex) const; + bool hasSweetSpotData(const int keyIndex) const { + // When there are no calibration data for a key, + // the radius of the key is assigned to zero. 
+ return mSweetSpotRadii[keyIndex] > 0.0f; + } + float getSweetSpotRadiiAt(int keyIndex) const { return mSweetSpotRadii[keyIndex]; } + float getSweetSpotCenterXAt(int keyIndex) const { return mSweetSpotCenterXs[keyIndex]; } + float getSweetSpotCenterYAt(int keyIndex) const { return mSweetSpotCenterYs[keyIndex]; } + bool hasTouchPositionCorrectionData() const { return HAS_TOUCH_POSITION_CORRECTION_DATA; } + int getMostCommonKeyWidth() const { return MOST_COMMON_KEY_WIDTH; } + int getMostCommonKeyWidthSquare() const { return MOST_COMMON_KEY_WIDTH_SQUARE; } + float getNormalizedSquaredMostCommonKeyHypotenuse() const { + return NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE; + } + int getKeyCount() const { return KEY_COUNT; } + int getCellHeight() const { return CELL_HEIGHT; } + int getCellWidth() const { return CELL_WIDTH; } + int getGridWidth() const { return GRID_WIDTH; } + int getGridHeight() const { return GRID_HEIGHT; } + int getKeyboardWidth() const { return KEYBOARD_WIDTH; } + int getKeyboardHeight() const { return KEYBOARD_HEIGHT; } + float getKeyboardHypotenuse() const { return KEYBOARD_HYPOTENUSE; } + + int getKeyCenterXOfKeyIdG( + const int keyId, const int referencePointX, const bool isGeometric) const; + int getKeyCenterYOfKeyIdG( + const int keyId, const int referencePointY, const bool isGeometric) const; + int getKeyKeyDistanceG(int keyId0, int keyId1) const; + + AK_FORCE_INLINE void initializeProximities(const int *const inputCodes, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int inputSize, int *allInputCodes, const std::vector *locale) const { + ProximityInfoUtils::initializeProximities(inputCodes, inputXCoordinates, inputYCoordinates, + inputSize, mKeyXCoordinates, mKeyYCoordinates, mKeyWidths, mKeyHeights, + mProximityCharsArray, CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH, MOST_COMMON_KEY_WIDTH, + KEY_COUNT, locale, &mLowerCodePointToKeyMap, allInputCodes); + } + + AK_FORCE_INLINE int getKeyIndexOf(const int c) const { 
+ return ProximityInfoUtils::getKeyIndexOf(KEY_COUNT, c, &mLowerCodePointToKeyMap); + } + + AK_FORCE_INLINE bool isCodePointOnKeyboard(const int codePoint) const { + return getKeyIndexOf(codePoint) != NOT_AN_INDEX; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfo); + + void initializeG(); + + const int GRID_WIDTH; + const int GRID_HEIGHT; + const int MOST_COMMON_KEY_WIDTH; + const int MOST_COMMON_KEY_WIDTH_SQUARE; + const float NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE; + const int CELL_WIDTH; + const int CELL_HEIGHT; + const int KEY_COUNT; + const int KEYBOARD_WIDTH; + const int KEYBOARD_HEIGHT; + const float KEYBOARD_HYPOTENUSE; + const bool HAS_TOUCH_POSITION_CORRECTION_DATA; + int *mProximityCharsArray; + int mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyWidths[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyHeights[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyCodePoints[MAX_KEY_COUNT_IN_A_KEYBOARD]; + float mSweetSpotCenterXs[MAX_KEY_COUNT_IN_A_KEYBOARD]; + float mSweetSpotCenterYs[MAX_KEY_COUNT_IN_A_KEYBOARD]; + // Sweet spots for geometric input. Note that we have extra sweet spots only for Y coordinates. 
+ float mSweetSpotCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + float mSweetSpotRadii[MAX_KEY_COUNT_IN_A_KEYBOARD]; + std::unordered_map mLowerCodePointToKeyMap; + int mKeyIndexToOriginalCodePoint[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyIndexToLowerCodePointG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mCenterXsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mCenterYsG[MAX_KEY_COUNT_IN_A_KEYBOARD]; + int mKeyKeyDistancesG[MAX_KEY_COUNT_IN_A_KEYBOARD][MAX_KEY_COUNT_IN_A_KEYBOARD]; +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_H diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_params.cpp b/app/src/main/jni/src/suggest/core/layout/proximity_info_params.cpp new file mode 100644 index 000000000..68bb0ae9d --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/proximity_info_params.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "defines.h"
+#include "suggest/core/layout/proximity_info_params.h"
+
+namespace latinime {
+// Out-of-class definitions of the static tuning constants declared in ProximityInfoParams
+// (proximity_info_params.h).
+// NOTE(review): the header also declares NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR, but no
+// definition for it appears in this file -- confirm it is defined elsewhere or never odr-used.
+const float ProximityInfoParams::NOT_A_DISTANCE_FLOAT = -1.0f;
+const int ProximityInfoParams::MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE = 5;
+const float ProximityInfoParams::VERTICAL_SWEET_SPOT_SCALE = 1.0f;
+const float ProximityInfoParams::VERTICAL_SWEET_SPOT_SCALE_G = 0.5f;
+
+/* Per method constants */
+// Used by ProximityInfoStateUtils::updateNearKeysDistances()
+const float ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_DISTANCE = 2.0f;
+
+// Used by ProximityInfoStateUtils::isPrevLocalMin()
+const float ProximityInfoParams::MARGIN_FOR_PREV_LOCAL_MIN = 0.01f;
+
+// Used by ProximityInfoStateUtils::getPointScore()
+const int ProximityInfoParams::DISTANCE_BASE_SCALE = 100;
+const float ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_POINT_SCORE = 0.6f;
+const int ProximityInfoParams::CORNER_CHECK_DISTANCE_THRESHOLD_SCALE = 25;
+const float ProximityInfoParams::NOT_LOCALMIN_DISTANCE_SCORE = -1.0f;
+const float ProximityInfoParams::LOCALMIN_DISTANCE_AND_NEAR_TO_KEY_SCORE = 1.0f;
+// Assuming M_PI_F is pi as a float (it is defined outside this file -- confirm), the two angle
+// thresholds below are in radians: M_PI_F * 2 / 3 is 120 degrees and M_PI_F / 4 is 45 degrees.
+const float ProximityInfoParams::CORNER_ANGLE_THRESHOLD_FOR_POINT_SCORE = M_PI_F * 2.0f / 3.0f;
+const float ProximityInfoParams::CORNER_SUM_ANGLE_THRESHOLD = M_PI_F / 4.0f;
+const float ProximityInfoParams::CORNER_SCORE = 1.0f;
+
+// Used by ProximityInfoStateUtils::refreshSpeedRates()
+const int ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION = 2;
+
+// Used by ProximityInfoStateUtils::pushTouchPoint()
+const int ProximityInfoParams::LAST_POINT_SKIP_DISTANCE_SCALE = 4;
+
+// Used by ProximityInfoStateUtils::updateAlignPointProbabilities()
+const float ProximityInfoParams::MIN_PROBABILITY = 0.000005f;
+const float ProximityInfoParams::MAX_SKIP_PROBABILITY = 0.95f;
+const float ProximityInfoParams::SKIP_FIRST_POINT_PROBABILITY = 0.01f;
+const float ProximityInfoParams::SKIP_LAST_POINT_PROBABILITY = 0.1f;
+const float ProximityInfoParams::MIN_SPEED_RATE_FOR_SKIP_PROBABILITY = 0.15f;
+const float ProximityInfoParams::SPEED_WEIGHT_FOR_SKIP_PROBABILITY = 0.9f;
+const float ProximityInfoParams::SLOW_STRAIGHT_WEIGHT_FOR_SKIP_PROBABILITY = 0.6f;
+const float ProximityInfoParams::NEAREST_DISTANCE_WEIGHT = 0.5f;
+const float ProximityInfoParams::NEAREST_DISTANCE_BIAS = 0.5f;
+const float ProximityInfoParams::NEAREST_DISTANCE_WEIGHT_FOR_LAST = 0.6f;
+const float ProximityInfoParams::NEAREST_DISTANCE_BIAS_FOR_LAST = 0.4f;
+const float ProximityInfoParams::ANGLE_WEIGHT = 0.90f;
+// The three angle thresholds below have the form M_PI_F * N / 180, i.e. N degrees (60, 30 and
+// 15) converted to radians, under the same assumption that M_PI_F is pi.
+const float ProximityInfoParams::DEEP_CORNER_ANGLE_THRESHOLD = M_PI_F * 60.0f / 180.0f;
+const float ProximityInfoParams::SKIP_DEEP_CORNER_PROBABILITY = 0.1f;
+const float ProximityInfoParams::CORNER_ANGLE_THRESHOLD = M_PI_F * 30.0f / 180.0f;
+const float ProximityInfoParams::STRAIGHT_ANGLE_THRESHOLD = M_PI_F * 15.0f / 180.0f;
+const float ProximityInfoParams::SKIP_CORNER_PROBABILITY = 0.4f;
+const float ProximityInfoParams::SPEED_MARGIN = 0.1f;
+const float ProximityInfoParams::CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION = 0.0f;
+// TODO: The variance is critical for accuracy; thus, adjusting these parameters by machine
+// learning or something would be efficient.
+const float ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION = 0.3f;
+const float ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION = 0.25f;
+const float ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION = 0.5f;
+const float ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION = 0.15f;
+const float ProximityInfoParams::MIN_STANDARD_DEVIATION = 0.37f;
+const float ProximityInfoParams::STANDARD_DEVIATION_X_WEIGHT_FOR_FIRST = 1.25f;
+const float ProximityInfoParams::STANDARD_DEVIATION_Y_WEIGHT_FOR_FIRST = 0.85f;
+const float ProximityInfoParams::STANDARD_DEVIATION_X_WEIGHT_FOR_LAST = 1.4f;
+const float ProximityInfoParams::STANDARD_DEVIATION_Y_WEIGHT_FOR_LAST = 0.95f;
+const float ProximityInfoParams::STANDARD_DEVIATION_X_WEIGHT = 1.1f;
+const float ProximityInfoParams::STANDARD_DEVIATION_Y_WEIGHT = 0.95f;
+
+// Used by ProximityInfoStateUtils::suppressCharProbabilities()
+const float ProximityInfoParams::SUPPRESSION_LENGTH_WEIGHT = 1.5f;
+const float ProximityInfoParams::MIN_SUPPRESSION_RATE = 0.1f;
+const float ProximityInfoParams::SUPPRESSION_WEIGHT = 0.5f;
+const float ProximityInfoParams::SUPPRESSION_WEIGHT_FOR_PROBABILITY_GAIN = 0.1f;
+// "PROBABALITY" is misspelled, but the spelling matches the declaration in the header.
+const float ProximityInfoParams::SKIP_PROBABALITY_WEIGHT_FOR_PROBABILITY_GAIN = 0.3f;
+
+// Used by ProximityInfoStateUtils::getMostProbableString()
+const float ProximityInfoParams::DEMOTION_LOG_PROBABILITY = 0.3f;
+
+// Used by ProximityInfoStateUtils::updateSampledSearchKeySets()
+// TODO: Investigate if this is required
+const float ProximityInfoParams::SEARCH_KEY_RADIUS_RATIO = 0.95f;
+
+// Used by ProximityInfoStateUtils::calculateBeelineSpeedRate()
+const int ProximityInfoParams::LOOKUP_RADIUS_PERCENTILE = 50;
+const int ProximityInfoParams::FIRST_POINT_TIME_OFFSET_MILLIS = 150;
+const int ProximityInfoParams::STRONG_DOUBLE_LETTER_TIME_MILLIS = 600;
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_params.h
b/app/src/main/jni/src/suggest/core/layout/proximity_info_params.h
new file mode 100644
index 000000000..d9515c837
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/layout/proximity_info_params.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROXIMITY_INFO_PARAMS_H
+#define LATINIME_PROXIMITY_INFO_PARAMS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Holder for the tuning constants of the proximity-info code. Every member is a static constant
+// whose value is defined out of class in proximity_info_params.cpp. The "Used by" comments name
+// the ProximityInfoStateUtils function each group belongs to.
+class ProximityInfoParams {
+ public:
+    static const float NOT_A_DISTANCE_FLOAT;
+    static const int MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE;
+    static const float VERTICAL_SWEET_SPOT_SCALE;
+    static const float VERTICAL_SWEET_SPOT_SCALE_G;
+
+    // Used by ProximityInfoStateUtils::updateNearKeysDistances()
+    static const float NEAR_KEY_THRESHOLD_FOR_DISTANCE;
+
+    // Used by ProximityInfoStateUtils::isPrevLocalMin()
+    static const float MARGIN_FOR_PREV_LOCAL_MIN;
+
+    // Used by ProximityInfoStateUtils::getPointScore()
+    static const int DISTANCE_BASE_SCALE;
+    static const float NEAR_KEY_THRESHOLD_FOR_POINT_SCORE;
+    static const int CORNER_CHECK_DISTANCE_THRESHOLD_SCALE;
+    static const float NOT_LOCALMIN_DISTANCE_SCORE;
+    static const float LOCALMIN_DISTANCE_AND_NEAR_TO_KEY_SCORE;
+    static const float CORNER_ANGLE_THRESHOLD_FOR_POINT_SCORE;
+    static const float CORNER_SUM_ANGLE_THRESHOLD;
+    static const float CORNER_SCORE;
+
+    // Used by ProximityInfoStateUtils::refreshSpeedRates()
+    static const int NUM_POINTS_FOR_SPEED_CALCULATION;
+
+    // Used by ProximityInfoStateUtils::pushTouchPoint()
+    static const int LAST_POINT_SKIP_DISTANCE_SCALE;
+
+    // Used by ProximityInfoStateUtils::updateAlignPointProbabilities()
+    static const float MIN_PROBABILITY;
+    static const float MAX_SKIP_PROBABILITY;
+    static const float SKIP_FIRST_POINT_PROBABILITY;
+    static const float SKIP_LAST_POINT_PROBABILITY;
+    static const float MIN_SPEED_RATE_FOR_SKIP_PROBABILITY;
+    static const float SPEED_WEIGHT_FOR_SKIP_PROBABILITY;
+    static const float SLOW_STRAIGHT_WEIGHT_FOR_SKIP_PROBABILITY;
+    static const float NEAREST_DISTANCE_WEIGHT;
+    static const float NEAREST_DISTANCE_BIAS;
+    static const float NEAREST_DISTANCE_WEIGHT_FOR_LAST;
+    static const float NEAREST_DISTANCE_BIAS_FOR_LAST;
+    static const float ANGLE_WEIGHT;
+    static const float DEEP_CORNER_ANGLE_THRESHOLD;
+    static const float SKIP_DEEP_CORNER_PROBABILITY;
+    static const float CORNER_ANGLE_THRESHOLD;
+    static const float STRAIGHT_ANGLE_THRESHOLD;
+    static const float SKIP_CORNER_PROBABILITY;
+    static const float SPEED_MARGIN;
+    static const float CENTER_VALUE_OF_NORMALIZED_DISTRIBUTION;
+    static const float SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION;
+    static const float MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION;
+    static const float SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION;
+    static const float MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION;
+    static const float MIN_STANDARD_DEVIATION;
+    // X means gesture's direction. Y means gesture's orthogonal direction.
+    static const float STANDARD_DEVIATION_X_WEIGHT_FOR_FIRST;
+    static const float STANDARD_DEVIATION_Y_WEIGHT_FOR_FIRST;
+    static const float STANDARD_DEVIATION_X_WEIGHT_FOR_LAST;
+    static const float STANDARD_DEVIATION_Y_WEIGHT_FOR_LAST;
+    static const float STANDARD_DEVIATION_X_WEIGHT;
+    static const float STANDARD_DEVIATION_Y_WEIGHT;
+
+    // Used by ProximityInfoStateUtils::suppressCharProbabilities()
+    static const float SUPPRESSION_LENGTH_WEIGHT;
+    static const float MIN_SUPPRESSION_RATE;
+    static const float SUPPRESSION_WEIGHT;
+    static const float SUPPRESSION_WEIGHT_FOR_PROBABILITY_GAIN;
+    // "PROBABALITY" is misspelled; the definition in proximity_info_params.cpp uses the same
+    // spelling, so the two must be renamed together if this is ever corrected.
+    static const float SKIP_PROBABALITY_WEIGHT_FOR_PROBABILITY_GAIN;
+
+    // Used by ProximityInfoStateUtils::getMostProbableString()
+    static const float DEMOTION_LOG_PROBABILITY;
+
+    // Used by ProximityInfoStateUtils::updateSampledSearchKeySets()
+    static const float SEARCH_KEY_RADIUS_RATIO;
+
+    // Used by ProximityInfoStateUtils::calculateBeelineSpeedRate()
+    static const int LOOKUP_RADIUS_PERCENTILE;
+    static const int FIRST_POINT_TIME_OFFSET_MILLIS;
+    static const int STRONG_DOUBLE_LETTER_TIME_MILLIS;
+
+    // Used by ProximityInfoStateUtils::calculateNormalizedSquaredDistance()
+    // NOTE(review): proximity_info_params.cpp contains no definition for this constant -- confirm
+    // it is defined elsewhere or never odr-used; otherwise any use fails at link time.
+    static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfoParams);
+};
+} // namespace latinime
+#endif // LATINIME_PROXIMITY_INFO_PARAMS_H
diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_state.cpp b/app/src/main/jni/src/suggest/core/layout/proximity_info_state.cpp
new file mode 100644
index 000000000..d43a0026a
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/layout/proximity_info_state.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "LatinIME: proximity_info_state.cpp"
+
+#include "suggest/core/layout/proximity_info_state.h"
+
+#include
+#include // for memset() and memmove()
+#include // for debug prints
+#include
+#include
+
+#include "defines.h"
+#include "suggest/core/layout/geometry_utils.h"
+#include "suggest/core/layout/proximity_info.h"
+#include "suggest/core/layout/proximity_info_state_utils.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// Returns the keyboard's original code point for the key that holds the primary (first) proximity
+// code point of input position `index`.
+// NOTE(review): getKeyIndexOf() can yield NOT_AN_INDEX for code points that are not on the
+// keyboard; that value is passed on unchecked, so the result then depends on how
+// ProximityInfo::getOriginalCodePointOf() treats it -- confirm.
+int ProximityInfoState::getPrimaryOriginalCodePointAt(const int index) const {
+    const int primaryCodePoint = getPrimaryCodePointAt(index);
+    const int keyIndex = mProximityInfo->getKeyIndexOf(primaryCodePoint);
+    return mProximityInfo->getOriginalCodePointOf(keyIndex);
+}
+
+// Resets this state for a new input and recomputes everything derived from it.
+//
+// pointerId            pointer whose points are sampled; proximity code points and the primary
+//                      input word are only filled in when this is 0 and the input is not
+//                      geometric.
+// maxPointToKeyLength  stored in mMaxPointToKeyLength and forwarded to the sampling helpers.
+// proximityInfo        keyboard description; dereferenced unconditionally, so must be non-null.
+// inputCodes           typed code points, forwarded to ProximityInfo::initializeProximities().
+// inputSize            number of input points; for non-geometric input it is ASSERTed to be
+//                      below MAX_WORD_LENGTH.
+// xCoordinates, yCoordinates
+//                      touch coordinates; touch points are sampled only when both are non-null.
+// times, pointerIds    forwarded to the ProximityInfoStateUtils helpers.
+// isGeometric          selects the geometric path (speed rates, alignment probabilities, search
+//                      key sets, most probable string).
+// locale               forwarded to ProximityInfo::initializeProximities().
+//
+// Previously sampled points are reused only when the previous call used the same isGeometric
+// value and checkAndReturnIsContinuousSuggestionPossible() accepts the new input; otherwise all
+// sampled data is cleared first.
+// TODO: Remove the dependency of "isGeometric"
+void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength,
+        const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize,
+        const int *const xCoordinates, const int *const yCoordinates, const int *const times,
+        const int *const pointerIds, const bool isGeometric, const std::vector *locale) {
+    ASSERT(isGeometric || (inputSize < MAX_WORD_LENGTH));
+    mIsContinuousSuggestionPossible = (mHasBeenUpdatedByGeometricInput != isGeometric) ?
+            false : ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible(
+                    inputSize, xCoordinates, yCoordinates, times, mSampledInputSize,
+                    &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledInputIndice);
+    if (DEBUG_DICT) {
+        AKLOGI("isContinuousSuggestionPossible = %s",
+                (mIsContinuousSuggestionPossible ? "true" : "false"));
+    }
+
+    // Cache the keyboard metrics of the ProximityInfo this state is bound to.
+    mProximityInfo = proximityInfo;
+    mHasTouchPositionCorrectionData = proximityInfo->hasTouchPositionCorrectionData();
+    mMostCommonKeyWidthSquare = proximityInfo->getMostCommonKeyWidthSquare();
+    mKeyCount = proximityInfo->getKeyCount();
+    mCellHeight = proximityInfo->getCellHeight();
+    mCellWidth = proximityInfo->getCellWidth();
+    // Width goes to width and height to height; these two assignments used to be crossed.
+    mGridWidth = proximityInfo->getGridWidth();
+    mGridHeight = proximityInfo->getGridHeight();
+
+    memset(mInputProximities, 0, sizeof(mInputProximities));
+
+    if (!isGeometric && pointerId == 0) {
+        mProximityInfo->initializeProximities(inputCodes, xCoordinates, yCoordinates,
+                inputSize, mInputProximities, locale);
+    }
+
+    ///////////////////////
+    // Setup touch points
+    int pushTouchPointStartIndex = 0;
+    int lastSavedInputSize = 0;
+    mMaxPointToKeyLength = maxPointToKeyLength;
+    mSampledInputSize = 0;
+    mMostProbableStringProbability = 0.0f;
+
+    if (mIsContinuousSuggestionPossible && mSampledInputIndice.size() > 1) {
+        // Just update difference.
+        // Previous two points are never skipped. Thus, we pop 2 input point data here.
+        pushTouchPointStartIndex = ProximityInfoStateUtils::trimLastTwoTouchPoints(
+                &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledLengthCache,
+                &mSampledInputIndice);
+        lastSavedInputSize = mSampledInputXs.size();
+    } else {
+        // Clear all data.
+        mSampledInputXs.clear();
+        mSampledInputYs.clear();
+        mSampledTimes.clear();
+        mSampledInputIndice.clear();
+        mSampledLengthCache.clear();
+        mSampledNormalizedSquaredLengthCache.clear();
+        mSampledSearchKeySets.clear();
+        mSpeedRates.clear();
+        mBeelineSpeedPercentiles.clear();
+        mCharProbabilities.clear();
+        mDirections.clear();
+    }
+
+    if (DEBUG_GEO_FULL) {
+        AKLOGI("Init ProximityInfoState: reused points = %d, last input size = %d",
+                pushTouchPointStartIndex, lastSavedInputSize);
+    }
+
+    if (xCoordinates && yCoordinates) {
+        mSampledInputSize = ProximityInfoStateUtils::updateTouchPoints(mProximityInfo,
+                mMaxPointToKeyLength, mInputProximities, xCoordinates, yCoordinates, times,
+                pointerIds, inputSize, isGeometric, pointerId,
+                pushTouchPointStartIndex, &mSampledInputXs, &mSampledInputYs, &mSampledTimes,
+                &mSampledLengthCache, &mSampledInputIndice);
+    }
+
+    if (mSampledInputSize > 0 && isGeometric) {
+        mAverageSpeed = ProximityInfoStateUtils::refreshSpeedRates(inputSize, xCoordinates,
+                yCoordinates, times, lastSavedInputSize, mSampledInputSize, &mSampledInputXs,
+                &mSampledInputYs, &mSampledTimes, &mSampledLengthCache, &mSampledInputIndice,
+                &mSpeedRates, &mDirections);
+        ProximityInfoStateUtils::refreshBeelineSpeedRates(mProximityInfo->getMostCommonKeyWidth(),
+                mAverageSpeed, inputSize, xCoordinates, yCoordinates, times, mSampledInputSize,
+                &mSampledInputXs, &mSampledInputYs, &mSampledInputIndice,
+                &mBeelineSpeedPercentiles);
+    }
+
+    if (mSampledInputSize > 0) {
+        ProximityInfoStateUtils::initGeometricDistanceInfos(mProximityInfo, mSampledInputSize,
+                lastSavedInputSize, isGeometric, &mSampledInputXs, &mSampledInputYs,
+                &mSampledNormalizedSquaredLengthCache);
+        if (isGeometric) {
+            // updates probabilities of skipping or mapping each key for all points.
+            ProximityInfoStateUtils::updateAlignPointProbabilities(
+                    mMaxPointToKeyLength, mProximityInfo->getMostCommonKeyWidth(),
+                    mProximityInfo->getKeyCount(), lastSavedInputSize, mSampledInputSize,
+                    &mSampledInputXs, &mSampledInputYs, &mSpeedRates, &mSampledLengthCache,
+                    &mSampledNormalizedSquaredLengthCache, mProximityInfo, &mCharProbabilities);
+            ProximityInfoStateUtils::updateSampledSearchKeySets(mProximityInfo,
+                    mSampledInputSize, lastSavedInputSize, &mSampledLengthCache,
+                    &mCharProbabilities, &mSampledSearchKeySets,
+                    &mSampledSearchKeyVectors);
+            mMostProbableStringProbability = ProximityInfoStateUtils::getMostProbableString(
+                    mProximityInfo, mSampledInputSize, &mCharProbabilities, mMostProbableString);
+
+        }
+    }
+
+    if (DEBUG_SAMPLING_POINTS) {
+        ProximityInfoStateUtils::dump(isGeometric, inputSize, xCoordinates, yCoordinates,
+                mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSpeedRates,
+                &mBeelineSpeedPercentiles);
+    }
+    // end
+    ///////////////////////
+
+    mTouchPositionCorrectionEnabled = mSampledInputSize > 0 && mHasTouchPositionCorrectionData
+            && xCoordinates && yCoordinates;
+    if (!isGeometric && pointerId == 0) {
+        ProximityInfoStateUtils::initPrimaryInputWord(
+                inputSize, mInputProximities, mPrimaryInputWord);
+    }
+    if (DEBUG_GEO_FULL) {
+        AKLOGI("ProximityState init finished: %d points out of %d", mSampledInputSize, inputSize);
+    }
+    // Remembered so the next call can tell whether the input mode changed.
+    mHasBeenUpdatedByGeometricInput = isGeometric;
+}
+
+// This function basically converts from a length to an edit distance. Accordingly, it's obviously
+// wrong to compare with mMaxPointToKeyLength.
+// Returns the cached value for sampled point `inputIndex` and the key that holds `codePoint`,
+// capped at mMaxPointToKeyLength. The cache is laid out with one row of getKeyCount() entries per
+// sampled point. When `codePoint` has no key, returns 0.0f if
+// CharUtils::isIntentionalOmissionCodePoint() accepts it and MAX_VALUE_FOR_WEIGHTING otherwise.
+// `inputIndex` is not range-checked here.
+float ProximityInfoState::getPointToKeyLength(
+        const int inputIndex, const int codePoint) const {
+    const int keyId = mProximityInfo->getKeyIndexOf(codePoint);
+    if (keyId != NOT_AN_INDEX) {
+        const int index = inputIndex * mProximityInfo->getKeyCount() + keyId;
+        return std::min(mSampledNormalizedSquaredLengthCache[index], mMaxPointToKeyLength);
+    }
+    if (CharUtils::isIntentionalOmissionCodePoint(codePoint)) {
+        return 0.0f;
+    }
+    // If the char is not a key on the keyboard then return the max length.
+    return static_cast(MAX_VALUE_FOR_WEIGHTING);
+}
+
+// Same lookup keyed by key id instead of code point; the work is delegated to
+// ProximityInfoStateUtils::getPointToKeyByIdLength().
+float ProximityInfoState::getPointToKeyByIdLength(
+        const int inputIndex, const int keyId) const {
+    return ProximityInfoStateUtils::getPointToKeyByIdLength(mMaxPointToKeyLength,
+            &mSampledNormalizedSquaredLengthCache, mProximityInfo->getKeyCount(), inputIndex,
+            keyId);
+}
+
+// In the following function, codePoint is the current character of the dictionary word currently
+// examined. currentCodePoints is an array containing the keys close to the character the user
+// actually typed at the same position. We want to see if codePoint is in it: if so, then the word
+// contains at that position a character close to what the user typed.
+// What the user typed is actually the first character of the array.
+// proximityIndex is a pointer to the variable where getProximityType returns the index of
+// codePoint in the proximity chars of the input index.
+// Notice : accented characters do not have a proximity list, so they are alone in their list. The
+// non-accented version of the character should be considered "close", but not the other keys close
+// to the non-accented version.
+// Results, in the order they are checked:
+//  - MATCH_CHAR: the first stored code point equals codePoint or its toBaseLowerCase() form.
+//  - SUBSTITUTION_CHAR: no match and checkProximityChars is false.
+//  - PROXIMITY_CHAR: the base-lowercased first code point matches (*proximityIndex is NOT
+//    written in this case), or a match is found in the list before the delimiter.
+//  - ADDITIONAL_PROXIMITY_CHAR: a match is found after ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE.
+//  - SUBSTITUTION_CHAR: nothing matched.
+// *proximityIndex is written only for matches found while scanning the list, and only when the
+// pointer is non-null.
+ProximityType ProximityInfoState::getProximityType(const int index, const int codePoint,
+        const bool checkProximityChars, int *proximityIndex) const {
+    const int *currentCodePoints = getProximityCodePointsAt(index);
+    const int firstCodePoint = currentCodePoints[0];
+    const int baseLowerC = CharUtils::toBaseLowerCase(codePoint);
+
+    // The first char in the array is what user typed. If it matches right away, that means the
+    // user typed that same char for this pos.
+    if (firstCodePoint == baseLowerC || firstCodePoint == codePoint) {
+        return MATCH_CHAR;
+    }
+
+    if (!checkProximityChars) return SUBSTITUTION_CHAR;
+
+    // If the non-accented, lowercased version of that first character matches c, then we have a
+    // non-accented version of the accented character the user typed. Treat it as a close char.
+    if (CharUtils::toBaseLowerCase(firstCodePoint) == baseLowerC) {
+        return PROXIMITY_CHAR;
+    }
+
+    // Not an exact nor an accent-alike match: search the list of close keys
+    int j = 1;
+    while (j < MAX_PROXIMITY_CHARS_SIZE
+            && currentCodePoints[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) {
+        const bool matched = (currentCodePoints[j] == baseLowerC
+                || currentCodePoints[j] == codePoint);
+        if (matched) {
+            if (proximityIndex) {
+                *proximityIndex = j;
+            }
+            return PROXIMITY_CHAR;
+        }
+        ++j;
+    }
+    // The first scan stops at the delimiter (if any); entries after it are the additional
+    // proximity characters.
+    if (j < MAX_PROXIMITY_CHARS_SIZE
+            && currentCodePoints[j] == ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) {
+        ++j;
+        while (j < MAX_PROXIMITY_CHARS_SIZE
+                && currentCodePoints[j] > ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE) {
+            const bool matched = (currentCodePoints[j] == baseLowerC
+                    || currentCodePoints[j] == codePoint);
+            if (matched) {
+                if (proximityIndex) {
+                    *proximityIndex = j;
+                }
+                return ADDITIONAL_PROXIMITY_CHAR;
+            }
+            ++j;
+        }
+    }
+    // Was not included, signal this as a substitution character.
+    return SUBSTITUTION_CHAR;
+}
+
+// Geometric-input variant: returns MATCH_CHAR when the search key vector of sampled point `index`
+// contains the lower-cased code point or its base code point, and UNRELATED_CHAR otherwise --
+// including when the state is unused or `index` is out of range (which is also logged and
+// ASSERTed).
+ProximityType ProximityInfoState::getProximityTypeG(const int index, const int codePoint) const {
+    if (!isUsed()) {
+        return UNRELATED_CHAR;
+    }
+    const int sampledSearchKeyVectorsSize = static_cast(mSampledSearchKeyVectors.size());
+    if (index < 0 || index >= sampledSearchKeyVectorsSize) {
+        AKLOGE("getProximityTypeG() is called with an invalid index(%d). "
+                "mSampledSearchKeyVectors.size() = %d, codePoint = %x.", index,
+                sampledSearchKeyVectorsSize, codePoint);
+        ASSERT(false);
+        return UNRELATED_CHAR;
+    }
+    const int lowerCodePoint = CharUtils::toLowerCase(codePoint);
+    const int baseLowerCodePoint = CharUtils::toBaseCodePoint(lowerCodePoint);
+    for (int i = 0; i < static_cast(mSampledSearchKeyVectors[index].size()); ++i) {
+        if (mSampledSearchKeyVectors[index][i] == lowerCodePoint
+                || mSampledSearchKeyVectors[index][i] == baseLowerCodePoint) {
+            return MATCH_CHAR;
+        }
+    }
+    return UNRELATED_CHAR;
+}
+
+// Returns whether key `keyId` is set in the search key set of sampled point `index`. The
+// arguments are only checked by ASSERT. ("Serch" is misspelled; the header declares the same
+// name.)
+bool ProximityInfoState::isKeyInSerchKeysAfterIndex(const int index, const int keyId) const {
+    ASSERT(keyId >= 0 && index >= 0 && index < mSampledInputSize);
+    return mSampledSearchKeySets[index].test(keyId);
+}
+
+// Direction between sampled points index0 and index1, as computed by
+// ProximityInfoStateUtils::getDirection() from the sampled x/y coordinates.
+float ProximityInfoState::getDirection(const int index0, const int index1) const {
+    return ProximityInfoStateUtils::getDirection(
+            &mSampledInputXs, &mSampledInputYs, index0, index1);
+}
+
+// Copies the whole mMostProbableString array (sizeof(mMostProbableString) bytes) into
+// codePointBuf and returns mMostProbableStringProbability. The caller's buffer must be at least
+// as large as that array.
+float ProximityInfoState::getMostProbableString(int *const codePointBuf) const {
+    memmove(codePointBuf, mMostProbableString, sizeof(mMostProbableString));
+    return mMostProbableStringProbability;
+}
+
+// Asks the bound ProximityInfo whether the sampled point `index` has space proximity; `index` is
+// only checked by ASSERT.
+bool ProximityInfoState::hasSpaceProximity(const int index) const {
+    ASSERT(0 <= index && index < mSampledInputSize);
+    return mProximityInfo->hasSpaceProximity(getInputX(index), getInputY(index));
+}
+
+// Returns a probability of mapping index to keyIndex.
+// The value comes from mCharProbabilities[index]; when that map has no entry for keyIndex,
+// MAX_VALUE_FOR_WEIGHTING is returned instead. `index` is only checked by ASSERT.
+float ProximityInfoState::getProbability(const int index, const int keyIndex) const {
+    ASSERT(0 <= index && index < mSampledInputSize);
+    std::unordered_map::const_iterator it = mCharProbabilities[index].find(keyIndex);
+    if (it != mCharProbabilities[index].end()) {
+        return it->second;
+    }
+    return static_cast(MAX_VALUE_FOR_WEIGHTING);
+}
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_state.h b/app/src/main/jni/src/suggest/core/layout/proximity_info_state.h
new file mode 100644
index 000000000..a2d663544
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/layout/proximity_info_state.h
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROXIMITY_INFO_STATE_H
+#define LATINIME_PROXIMITY_INFO_STATE_H
+
+#include // for memset()
+#include
+#include
+
+#include "defines.h"
+#include "suggest/core/layout/proximity_info_params.h"
+#include "suggest/core/layout/proximity_info_state_utils.h"
+
+namespace latinime {
+
+class ProximityInfo;
+
+// Working state for one input: the sampled touch points plus the values derived from them
+// (length caches, speed rates, directions, per-point key probabilities, search key sets, and the
+// proximity code points of typed input). Everything is (re)computed by initInputParams(); the
+// accessors below only read it. Accessors that index the sampled vectors do not range-check
+// their argument unless stated otherwise.
+class ProximityInfoState {
+ public:
+    /////////////////////////////////////////
+    // Defined in proximity_info_state.cpp //
+    /////////////////////////////////////////
+    void initInputParams(const int pointerId, const float maxPointToKeyLength,
+            const ProximityInfo *proximityInfo, const int *const inputCodes,
+            const int inputSize, const int *xCoordinates, const int *yCoordinates,
+            const int *const times, const int *const pointerIds, const bool isGeometric,
+            const std::vector *locale);
+
+    /////////////////////////////////////////
+    // Defined here                        //
+    /////////////////////////////////////////
+    // Creates an empty, unused state (isUsed() is false) with all fixed-size buffers zeroed.
+    AK_FORCE_INLINE ProximityInfoState()
+            : mProximityInfo(nullptr), mMaxPointToKeyLength(0.0f), mAverageSpeed(0.0f),
+              mHasTouchPositionCorrectionData(false), mMostCommonKeyWidthSquare(0),
+              mKeyCount(0), mCellHeight(0), mCellWidth(0), mGridHeight(0), mGridWidth(0),
+              mIsContinuousSuggestionPossible(false), mHasBeenUpdatedByGeometricInput(false),
+              mSampledInputXs(), mSampledInputYs(), mSampledTimes(), mSampledInputIndice(),
+              mSampledLengthCache(), mBeelineSpeedPercentiles(),
+              mSampledNormalizedSquaredLengthCache(), mSpeedRates(), mDirections(),
+              mCharProbabilities(), mSampledSearchKeySets(), mSampledSearchKeyVectors(),
+              mTouchPositionCorrectionEnabled(false), mSampledInputSize(0),
+              mMostProbableStringProbability(0.0f) {
+        memset(mInputProximities, 0, sizeof(mInputProximities));
+        memset(mPrimaryInputWord, 0, sizeof(mPrimaryInputWord));
+        memset(mMostProbableString, 0, sizeof(mMostProbableString));
+    }
+
+    // Non virtual inline destructor -- never inherit this class
+    AK_FORCE_INLINE ~ProximityInfoState() {}
+
+    // Returns the first proximity code point stored for input position `index`.
+    inline int getPrimaryCodePointAt(const int index) const {
+        return getProximityCodePointsAt(index)[0];
+    }
+
+    int getPrimaryOriginalCodePointAt(const int index) const;
+
+    // Returns true when `length` equals the sampled input size and word[i] equals the first
+    // proximity code point of input position i for every i.
+    inline bool sameAsTyped(const int *word, int length) const {
+        if (length != mSampledInputSize) {
+            return false;
+        }
+        const int *inputProximities = mInputProximities;
+        while (length--) {
+            if (*inputProximities != *word) {
+                return false;
+            }
+            // Each input position owns MAX_PROXIMITY_CHARS_SIZE consecutive slots.
+            inputProximities += MAX_PROXIMITY_CHARS_SIZE;
+            word++;
+        }
+        return true;
+    }
+
+    // Returns true if `c` is among the proximity code points stored for input position `index`.
+    // The scan stops at the first non-positive entry or after MAX_PROXIMITY_CHARS_SIZE entries.
+    AK_FORCE_INLINE bool existsCodePointInProximityAt(const int index, const int c) const {
+        const int *codePoints = getProximityCodePointsAt(index);
+        int i = 0;
+        // The bound must be tested before the element is read: with the operands the other way
+        // round, a position whose slots are all positive caused a read one element past them.
+        while (i < MAX_PROXIMITY_CHARS_SIZE && codePoints[i] > 0) {
+            if (codePoints[i++] == c) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Returns true if the primary code point at `index` also appears in the proximity list of
+    // the previous or the next input position. Out-of-range `index` yields false.
+    AK_FORCE_INLINE bool existsAdjacentProximityChars(const int index) const {
+        if (index < 0 || index >= mSampledInputSize) return false;
+        const int currentCodePoint = getPrimaryCodePointAt(index);
+        const int leftIndex = index - 1;
+        if (leftIndex >= 0 && existsCodePointInProximityAt(leftIndex, currentCodePoint)) {
+            return true;
+        }
+        const int rightIndex = index + 1;
+        if (rightIndex < mSampledInputSize
+                && existsCodePointInProximityAt(rightIndex, currentCodePoint)) {
+            return true;
+        }
+        return false;
+    }
+
+    inline bool touchPositionCorrectionEnabled() const {
+        return mTouchPositionCorrectionEnabled;
+    }
+
+    // True once initInputParams() has produced at least one sampled point.
+    bool isUsed() const {
+        return mSampledInputSize > 0;
+    }
+
+    // Number of sampled input points.
+    int size() const {
+        return mSampledInputSize;
+    }
+
+    int getInputX(const int index) const {
+        return mSampledInputXs[index];
+    }
+
+    int getInputY(const int index) const {
+        return mSampledInputYs[index];
+    }
+
+    int getInputIndexOfSampledPoint(const int sampledIndex) const {
+        return mSampledInputIndice[sampledIndex];
+    }
+
+    bool hasSpaceProximity(const int index) const;
+
+    int getLengthCache(const int index) const {
+        return mSampledLengthCache[index];
+    }
+
+    bool isContinuousSuggestionPossible() const {
+        return mIsContinuousSuggestionPossible;
+    }
+
+    // TODO: Rename s/Length/NormalizedSquaredLength/
+    float getPointToKeyByIdLength(const int inputIndex, const int keyId) const;
+    // TODO: Rename s/Length/NormalizedSquaredLength/
+    float getPointToKeyLength(const int inputIndex, const int codePoint) const;
+
+    // proximityIndex is optional; pass nullptr (the default) when the index is not needed.
+    ProximityType getProximityType(const int index, const int codePoint,
+            const bool checkProximityChars, int *proximityIndex = nullptr) const;
+
+    ProximityType getProximityTypeG(const int index, const int codePoint) const;
+
+    float getSpeedRate(const int index) const {
+        return mSpeedRates[index];
+    }
+
+    AK_FORCE_INLINE int getBeelineSpeedPercentile(const int id) const {
+        return mBeelineSpeedPercentiles[id];
+    }
+
+    // Classifies sampled point `id` by its beeline speed percentile: 0 is a strong double letter,
+    // anything below MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE is a double letter, the rest is
+    // not a double letter.
+    AK_FORCE_INLINE DoubleLetterLevel getDoubleLetterLevel(const int id) const {
+        const int beelineSpeedRate = getBeelineSpeedPercentile(id);
+        if (beelineSpeedRate == 0) {
+            return A_STRONG_DOUBLE_LETTER;
+        } else if (beelineSpeedRate
+                < ProximityInfoParams::MIN_DOUBLE_LETTER_BEELINE_SPEED_PERCENTILE) {
+            return A_DOUBLE_LETTER;
+        } else {
+            return NOT_A_DOUBLE_LETTER;
+        }
+    }
+
+    // Cached direction stored for sampled point `index`.
+    float getDirection(const int index) const {
+        return mDirections[index];
+    }
+    // Direction between two sampled points. Despite the names, x and y are sampled-point indices
+    // (index0 and index1 in the definition), not coordinates.
+    float getDirection(const int x, const int y) const;
+
+    float getMostProbableString(int *const codePointBuf) const;
+
+    // charCode is used as the key index into the per-point probability map (the definition names
+    // this parameter keyIndex).
+    float getProbability(const int index, const int charCode) const;
+
+    bool isKeyInSerchKeysAfterIndex(const int index, const int keyId) const;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(ProximityInfoState);
+
+    // Returns the proximity code point slots of input position `index` within mInputProximities.
+    inline const int *getProximityCodePointsAt(const int index) const {
+        return ProximityInfoStateUtils::getProximityCodePointsAt(mInputProximities, index);
+    }
+
+    // const
+    const ProximityInfo *mProximityInfo;
+    float mMaxPointToKeyLength;
+    float mAverageSpeed;
+    bool mHasTouchPositionCorrectionData;
+    int mMostCommonKeyWidthSquare;
+    int mKeyCount;
+    int mCellHeight;
+    int mCellWidth;
+    int mGridHeight;
+    int mGridWidth;
+    bool mIsContinuousSuggestionPossible;
+    bool mHasBeenUpdatedByGeometricInput;
+
+    std::vector mSampledInputXs;
+    std::vector mSampledInputYs;
+    std::vector mSampledTimes;
+    std::vector mSampledInputIndice;
+    std::vector mSampledLengthCache;
+    std::vector mBeelineSpeedPercentiles;
+    std::vector mSampledNormalizedSquaredLengthCache;
+    std::vector mSpeedRates;
+    std::vector mDirections;
+    // probabilities of skipping or mapping to a key for each point.
+    std::vector> mCharProbabilities;
+    // The vector for the key code set which holds nearby keys of some trailing sampled input points
+    // for each sampled input point. These nearby keys contain the next characters which can be in
+    // the dictionary. Specifically, currently we are looking for keys nearby trailing sampled
+    // inputs including the current input point.
+    std::vector mSampledSearchKeySets;
+    std::vector> mSampledSearchKeyVectors;
+    bool mTouchPositionCorrectionEnabled;
+    // MAX_PROXIMITY_CHARS_SIZE slots per input position, for up to MAX_WORD_LENGTH positions.
+    int mInputProximities[MAX_PROXIMITY_CHARS_SIZE * MAX_WORD_LENGTH];
+    int mSampledInputSize;
+    int mPrimaryInputWord[MAX_WORD_LENGTH];
+    float mMostProbableStringProbability;
+    int mMostProbableString[MAX_WORD_LENGTH];
+};
+} // namespace latinime
+#endif // LATINIME_PROXIMITY_INFO_STATE_H
diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_state_utils.cpp b/app/src/main/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
new file mode 100644
index 000000000..0aeb36aad
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/layout/proximity_info_state_utils.cpp
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/layout/proximity_info_state_utils.h" + +#include +#include +#include // for memset() +#include // for debug prints +#include +#include + +#include "defines.h" +#include "suggest/core/layout/geometry_utils.h" +#include "suggest/core/layout/normal_distribution_2d.h" +#include "suggest/core/layout/proximity_info.h" +#include "suggest/core/layout/proximity_info_params.h" + +namespace latinime { + +/* Pops the two newest sampled points from all five parallel sample vectors and returns the raw input index that was stored for the second-to-last sample before popping. NOTE(review): reads element size() - 2 without a size check, so presumably callers guarantee at least two samples -- confirm. */ +/* static */ int ProximityInfoStateUtils::trimLastTwoTouchPoints(std::vector *sampledInputXs, + std::vector *sampledInputYs, std::vector *sampledInputTimes, + std::vector *sampledLengthCache, std::vector *sampledInputIndice) { + const int nextStartIndex = (*sampledInputIndice)[sampledInputIndice->size() - 2]; + popInputData(sampledInputXs, sampledInputYs, sampledInputTimes, sampledLengthCache, + sampledInputIndice); + popInputData(sampledInputXs, sampledInputYs, sampledInputTimes, sampledLengthCache, + sampledInputIndice); + return nextStartIndex; +} + +/* Passes the raw input points from pushTouchPointStartIndex up to the last index that belongs to pointerId through pushTouchPoint() and returns the resulting size of sampledInputXs. Points with a different pointer id are skipped; a null pointerIds is treated as id 0 for every point. isGeometric is also passed as the doSampling argument, so sampling only happens for geometric input. */ +/* static */ int ProximityInfoStateUtils::updateTouchPoints( + const ProximityInfo *const proximityInfo, const int maxPointToKeyLength, + const int *const inputProximities, const int *const inputXCoordinates, + const int *const inputYCoordinates, const int *const times, const int *const pointerIds, + const int inputSize, const bool isGeometric, const int pointerId, + const int pushTouchPointStartIndex, std::vector *sampledInputXs, + std::vector *sampledInputYs, std::vector *sampledInputTimes, + std::vector *sampledLengthCache, std::vector *sampledInputIndice) { + if
(DEBUG_SAMPLING_POINTS) { + if (times) { + for (int i = 0; i < inputSize; ++i) { + AKLOGI("(%d) x %d, y %d, time %d", + i, inputXCoordinates[i], inputYCoordinates[i], times[i]); + } + } + } +#ifdef DO_ASSERT_TEST + if (times) { + for (int i = 0; i < inputSize; ++i) { + if (i > 0) { + if (times[i] < times[i - 1]) { + AKLOGI("Invalid time sequence. %d, %d", times[i - 1], times[i]); + ASSERT(false); + } + } + } + } +#endif + /* Non-geometric input whose first coordinate is negative is treated as proximity-only: every point is then pushed with NOT_A_COORDINATE for x and y. */ + const bool proximityOnly = !isGeometric + && (inputXCoordinates[0] < 0 || inputYCoordinates[0] < 0); + /* Find the last raw input index that belongs to pointerId. */ + int lastInputIndex = pushTouchPointStartIndex; + for (int i = lastInputIndex; i < inputSize; ++i) { + const int pid = pointerIds ? pointerIds[i] : 0; + if (pointerId == pid) { + lastInputIndex = i; + } + } + if (DEBUG_GEO_FULL) { + AKLOGI("Init ProximityInfoState: last input index = %d", lastInputIndex); + } + // Working space to save near keys distances for current, prev and prevprev input point. + NearKeysDistanceMap nearKeysDistances[3]; + // These pointers are swapped for each input point. + NearKeysDistanceMap *currentNearKeysDistances = &nearKeysDistances[0]; + NearKeysDistanceMap *prevNearKeysDistances = &nearKeysDistances[1]; + NearKeysDistanceMap *prevPrevNearKeysDistances = &nearKeysDistances[2]; + // "sumAngle" is accumulated by each angle of input points. And when "sumAngle" exceeds + // the threshold we save that point, reset sumAngle. This aims to keep the figure of + // the curve. + float sumAngle = 0.0f; + + for (int i = pushTouchPointStartIndex; i <= lastInputIndex; ++i) { + // Assuming pointerId == 0 if pointerIds is null. + const int pid = pointerIds ? pointerIds[i] : 0; + if (DEBUG_GEO_FULL) { + AKLOGI("Init ProximityInfoState: (%d)PID = %d", i, pid); + } + if (pointerId == pid) { + const int c = isGeometric ? + NOT_A_COORDINATE : getPrimaryCodePointAt(inputProximities, i); + const int x = proximityOnly ? NOT_A_COORDINATE : inputXCoordinates[i]; + const int y = proximityOnly ?
NOT_A_COORDINATE : inputYCoordinates[i]; + const int time = times ? times[i] : -1; + + if (i > 1) { + const float prevAngle = GeometryUtils::getAngle( + inputXCoordinates[i - 2], inputYCoordinates[i - 2], + inputXCoordinates[i - 1], inputYCoordinates[i - 1]); + const float currentAngle = GeometryUtils::getAngle( + inputXCoordinates[i - 1], inputYCoordinates[i - 1], x, y); + sumAngle += GeometryUtils::getAngleDiff(prevAngle, currentAngle); + } + + if (pushTouchPoint(proximityInfo, maxPointToKeyLength, i, c, x, y, time, + isGeometric, isGeometric /* doSampling */, i == lastInputIndex, + sumAngle, currentNearKeysDistances, prevNearKeysDistances, + prevPrevNearKeysDistances, sampledInputXs, sampledInputYs, sampledInputTimes, + sampledLengthCache, sampledInputIndice)) { + // Previous point information was popped. + NearKeysDistanceMap *tmp = prevNearKeysDistances; + prevNearKeysDistances = currentNearKeysDistances; + currentNearKeysDistances = tmp; + } else { + /* Nothing was popped: rotate the three work maps (current becomes prev, prev becomes prevPrev) and restart the angle accumulation. */ + NearKeysDistanceMap *tmp = prevPrevNearKeysDistances; + prevPrevNearKeysDistances = prevNearKeysDistances; + prevNearKeysDistances = currentNearKeysDistances; + currentNearKeysDistances = tmp; + sumAngle = 0.0f; + } + } + } + return sampledInputXs->size(); +} + +/* Returns the address of the slot for index inside inputProximities; consecutive slots are MAX_PROXIMITY_CHARS_SIZE ints apart. */ +/* static */ const int *ProximityInfoStateUtils::getProximityCodePointsAt( + const int *const inputProximities, const int index) { + return inputProximities + (index * MAX_PROXIMITY_CHARS_SIZE); +} + +/* Returns the first entry of the proximity slot for index. */ +/* static */ int ProximityInfoStateUtils::getPrimaryCodePointAt(const int *const inputProximities, + const int index) { + return getProximityCodePointsAt(inputProximities, index)[0]; +} + +/* Zero-fills MAX_WORD_LENGTH entries of primaryInputWord, then stores the primary code point of each of the first inputSize inputs. NOTE(review): inputSize is not checked against MAX_WORD_LENGTH here -- presumably the caller enforces it; confirm. */ +/* static */ void ProximityInfoStateUtils::initPrimaryInputWord(const int inputSize, + const int *const inputProximities, int *primaryInputWord) { + memset(primaryInputWord, 0, sizeof(primaryInputWord[0]) * MAX_WORD_LENGTH); + for (int i = 0; i < inputSize; ++i) { + primaryInputWord[i] = getPrimaryCodePointAt(inputProximities, i); + } +} + +/* Returns the squared distance between sampled point inputIndex and the sweet spot center of key keyIndex. */ +/* static */ float
ProximityInfoStateUtils::calculateSquaredDistanceFromSweetSpotCenter( + const ProximityInfo *const proximityInfo, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int keyIndex, const int inputIndex) { + const float sweetSpotCenterX = proximityInfo->getSweetSpotCenterXAt(keyIndex); + const float sweetSpotCenterY = proximityInfo->getSweetSpotCenterYAt(keyIndex); + const float inputX = static_cast((*sampledInputXs)[inputIndex]); + const float inputY = static_cast((*sampledInputYs)[inputIndex]); + return GeometryUtils::SQUARE_FLOAT(inputX - sweetSpotCenterX) + + GeometryUtils::SQUARE_FLOAT(inputY - sweetSpotCenterY); +} + +/* Returns the squared distance to the sweet spot center of keyIndex divided by the squared sweet spot radius, or NOT_A_DISTANCE_FLOAT when keyIndex is NOT_AN_INDEX, the key has no sweet spot data, or the sampled x is NOT_A_COORDINATE. */ +/* static */ float ProximityInfoStateUtils::calculateNormalizedSquaredDistance( + const ProximityInfo *const proximityInfo, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int keyIndex, const int inputIndex) { + if (keyIndex == NOT_AN_INDEX) { + return ProximityInfoParams::NOT_A_DISTANCE_FLOAT; + } + if (!proximityInfo->hasSweetSpotData(keyIndex)) { + return ProximityInfoParams::NOT_A_DISTANCE_FLOAT; + } + if (NOT_A_COORDINATE == (*sampledInputXs)[inputIndex]) { + return ProximityInfoParams::NOT_A_DISTANCE_FLOAT; + } + const float squaredDistance = calculateSquaredDistanceFromSweetSpotCenter(proximityInfo, + sampledInputXs, sampledInputYs, keyIndex, inputIndex); + const float squaredRadius = GeometryUtils::SQUARE_FLOAT( + proximityInfo->getSweetSpotRadiiAt(keyIndex)); + return squaredDistance / squaredRadius; +} + +/* Resizes the cache to sampledInputSize * keyCount and, for every sampled point from lastSavedInputSize on, stores the normalized squared distance to each key; the entry for point i and key k is at index i * keyCount + k. */ +/* static */ void ProximityInfoStateUtils::initGeometricDistanceInfos( + const ProximityInfo *const proximityInfo, const int sampledInputSize, + const int lastSavedInputSize, const bool isGeometric, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + std::vector *sampledNormalizedSquaredLengthCache) { + const int keyCount = proximityInfo->getKeyCount(); + sampledNormalizedSquaredLengthCache->resize(sampledInputSize *
keyCount); + for (int i = lastSavedInputSize; i < sampledInputSize; ++i) { + for (int k = 0; k < keyCount; ++k) { + const int index = i * keyCount + k; + const int x = (*sampledInputXs)[i]; + const int y = (*sampledInputYs)[i]; + const float normalizedSquaredDistance = + proximityInfo->getNormalizedSquaredDistanceFromCenterFloatG( + k, x, y, isGeometric); + (*sampledNormalizedSquaredLengthCache)[index] = normalizedSquaredDistance; + } + } +} + +/* Removes the last element from each of the five parallel sample vectors. */ +/* static */ void ProximityInfoStateUtils::popInputData(std::vector *sampledInputXs, + std::vector *sampledInputYs, std::vector *sampledInputTimes, + std::vector *sampledLengthCache, std::vector *sampledInputIndice) { + sampledInputXs->pop_back(); + sampledInputYs->pop_back(); + sampledInputTimes->pop_back(); + sampledLengthCache->pop_back(); + sampledInputIndice->pop_back(); +} + +/* For each sampled point from lastSavedInputSize on, stores its local speed divided by the average speed over the whole sampled stroke (1.0f when that cannot be computed), refreshes the direction between consecutive sampled points, and returns the average speed. NOTE(review): calls back() and front() on the sample vectors and resizes the directions to sampledInputSize - 1, so presumably requires at least one sampled point -- confirm at the call sites. */ +/* static */ float ProximityInfoStateUtils::refreshSpeedRates(const int inputSize, + const int *const xCoordinates, const int *const yCoordinates, const int *const times, + const int lastSavedInputSize, const int sampledInputSize, + const std::vector *const sampledInputXs, const std::vector *const sampledInputYs, + const std::vector *const sampledInputTimes, + const std::vector *const sampledLengthCache, + const std::vector *const sampledInputIndice, std::vector *sampledSpeedRates, + std::vector *sampledDirections) { + // Relative speed calculation. + const int sumDuration = sampledInputTimes->back() - sampledInputTimes->front(); + const int sumLength = sampledLengthCache->back() - sampledLengthCache->front(); + const float averageSpeed = static_cast(sumLength) / static_cast(sumDuration); + sampledSpeedRates->resize(sampledInputSize); + for (int i = lastSavedInputSize; i < sampledInputSize; ++i) { + const int index = (*sampledInputIndice)[i]; + int length = 0; + int duration = 0; + + // Calculate velocity by using distances and durations of + // ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION points for both forward and + // backward.
+ const int forwardNumPoints = std::min(inputSize - 1, + index + ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION); + for (int j = index; j < forwardNumPoints; ++j) { + /* Stop once j reaches the raw index of the next sampled point. */ + if (i < sampledInputSize - 1 && j >= (*sampledInputIndice)[i + 1]) { + break; + } + length += GeometryUtils::getDistanceInt(xCoordinates[j], yCoordinates[j], + xCoordinates[j + 1], yCoordinates[j + 1]); + duration += times[j + 1] - times[j]; + } + const int backwardNumPoints = std::max(0, + index - ProximityInfoParams::NUM_POINTS_FOR_SPEED_CALCULATION); + for (int j = index - 1; j >= backwardNumPoints; --j) { + /* Stop once j falls below the raw index of the previous sampled point. */ + if (i > 0 && j < (*sampledInputIndice)[i - 1]) { + break; + } + // TODO: use mSampledLengthCache instead? + length += GeometryUtils::getDistanceInt(xCoordinates[j], yCoordinates[j], + xCoordinates[j + 1], yCoordinates[j + 1]); + duration += times[j + 1] - times[j]; + } + if (duration == 0 || sumDuration == 0) { + // Cannot calculate speed; thus, it gives an average value (1.0); + (*sampledSpeedRates)[i] = 1.0f; + } else { + const float speed = static_cast(length) / static_cast(duration); + (*sampledSpeedRates)[i] = speed / averageSpeed; + } + } + + // Direction calculation.
+ sampledDirections->resize(sampledInputSize - 1); + for (int i = std::max(0, lastSavedInputSize - 1); i < sampledInputSize - 1; ++i) { + (*sampledDirections)[i] = getDirection(sampledInputXs, sampledInputYs, i, i + 1); + } + return averageSpeed; +} + +/* Resizes beelineSpeedPercentiles to sampledInputSize and stores, for every sampled point, calculateBeelineSpeedRate() multiplied by MAX_PERCENTILE and converted by static_cast (the target type is not visible in this view). */ +/* static */ void ProximityInfoStateUtils::refreshBeelineSpeedRates(const int mostCommonKeyWidth, + const float averageSpeed, const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *times, const int sampledInputSize, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const std::vector *const inputIndice, + std::vector *beelineSpeedPercentiles) { + if (DEBUG_SAMPLING_POINTS) { + AKLOGI("--- refresh beeline speed rates"); + } + beelineSpeedPercentiles->resize(sampledInputSize); + for (int i = 0; i < sampledInputSize; ++i) { + (*beelineSpeedPercentiles)[i] = static_cast(calculateBeelineSpeedRate( + mostCommonKeyWidth, averageSpeed, i, inputSize, xCoordinates, yCoordinates, times, + sampledInputSize, sampledInputXs, sampledInputYs, inputIndice) * MAX_PERCENTILE); + } +} + +/* Returns GeometryUtils::getAngle(x1, y1, x2, y2) for the sampled points at index0 (x1, y1) and index1 (x2, y2), or 0.0f when either index lies outside the sampled range. */ +/* static */float ProximityInfoStateUtils::getDirection( + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int index0, const int index1) { + ASSERT(sampledInputXs && sampledInputYs); + const int sampledInputSize =sampledInputXs->size(); + if (index0 < 0 || index0 > sampledInputSize - 1) { + return 0.0f; + } + if (index1 < 0 || index1 > sampledInputSize - 1) { + return 0.0f; + } + const int x1 = (*sampledInputXs)[index0]; + const int y1 = (*sampledInputYs)[index0]; + const int x2 = (*sampledInputXs)[index1]; + const int y2 = (*sampledInputYs)[index1]; + return GeometryUtils::getAngle(x1, y1, x2, y2); +} + +// Calculating point to key distance for all near keys and returning the distance between +// the given point and the nearest key position.
+/* Clears currentNearKeysDistances, then records every key whose normalized squared distance to (x, y) is below NEAR_KEY_THRESHOLD_FOR_DISTANCE. The returned value starts at maxPointToKeyLength and is lowered to the smallest per-key distance found, so it never exceeds maxPointToKeyLength. */ +/* static */ float ProximityInfoStateUtils::updateNearKeysDistances( + const ProximityInfo *const proximityInfo, const float maxPointToKeyLength, const int x, + const int y, const bool isGeometric, NearKeysDistanceMap *const currentNearKeysDistances) { + currentNearKeysDistances->clear(); + const int keyCount = proximityInfo->getKeyCount(); + float nearestKeyDistance = maxPointToKeyLength; + for (int k = 0; k < keyCount; ++k) { + const float dist = proximityInfo->getNormalizedSquaredDistanceFromCenterFloatG(k, x, y, + isGeometric); + if (dist < ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_DISTANCE) { + currentNearKeysDistances->insert(std::pair(k, dist)); + } + if (nearestKeyDistance > dist) { + nearestKeyDistance = dist; + } + } + return nearestKeyDistance; +} + +// Check if previous point is at local minimum position to near keys. +/* Returns true as soon as one key recorded for the previous point is, in both the prev-prev map and the current map, either missing or farther away by more than MARGIN_FOR_PREV_LOCAL_MIN; returns false otherwise. */ +/* static */ bool ProximityInfoStateUtils::isPrevLocalMin( + const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances) { + for (NearKeysDistanceMap::const_iterator it = prevNearKeysDistances->begin(); + it != prevNearKeysDistances->end(); ++it) { + NearKeysDistanceMap::const_iterator itPP = prevPrevNearKeysDistances->find(it->first); + NearKeysDistanceMap::const_iterator itC = currentNearKeysDistances->find(it->first); + /* Despite their names, both flags below are true when the key is absent from the neighbouring map or clearly farther away there than at the previous point. */ + const bool isPrevPrevNear = (itPP == prevPrevNearKeysDistances->end() + || itPP->second > it->second + ProximityInfoParams::MARGIN_FOR_PREV_LOCAL_MIN); + const bool isCurrentNear = (itC == currentNearKeysDistances->end() + || itC->second > it->second + ProximityInfoParams::MARGIN_FOR_PREV_LOCAL_MIN); + if (isPrevPrevNear && isCurrentNear) { + return true; + } + } + return false; +} + +// Calculating a point score that indicates usefulness of the point.
+/* Returns 0.0f when at most one point has been sampled or the previous near-key map is empty; otherwise returns the sum of the location term and the corner term computed below. The time and lastPoint parameters are not read in this body. */ +/* static */ float ProximityInfoStateUtils::getPointScore(const int mostCommonKeyWidth, + const int x, const int y, const int time, const bool lastPoint, const float nearest, + const float sumAngle, const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances, + std::vector *sampledInputXs, std::vector *sampledInputYs) { + const size_t size = sampledInputXs->size(); + // If there is only one point, add this point. Besides, if the previous point's distance map + // is empty, we re-compute nearby keys distances from the current point. + // Note that the current point is the first point in the incremental input that needs to + // be re-computed. + if (size <= 1 || prevNearKeysDistances->empty()) { + return 0.0f; + } + + const int baseSampleRate = mostCommonKeyWidth; + /* Distance between the two newest sampled points, multiplied by DISTANCE_BASE_SCALE. */ + const int distPrev = GeometryUtils::getDistanceInt(sampledInputXs->back(), + sampledInputYs->back(), (*sampledInputXs)[size - 2], + (*sampledInputYs)[size - 2]) * ProximityInfoParams::DISTANCE_BASE_SCALE; + float score = 0.0f; + + // Location + if (!isPrevLocalMin(currentNearKeysDistances, prevNearKeysDistances, + prevPrevNearKeysDistances)) { + score += ProximityInfoParams::NOT_LOCALMIN_DISTANCE_SCORE; + } else if (nearest < ProximityInfoParams::NEAR_KEY_THRESHOLD_FOR_POINT_SCORE) { + // Promote points nearby keys + score += ProximityInfoParams::LOCALMIN_DISTANCE_AND_NEAR_TO_KEY_SCORE; + } + // Angle + const float angle1 = GeometryUtils::getAngle(x, y, sampledInputXs->back(), + sampledInputYs->back()); + const float angle2 = GeometryUtils::getAngle(sampledInputXs->back(), sampledInputYs->back(), + (*sampledInputXs)[size - 2], (*sampledInputYs)[size - 2]); + const float angleDiff = GeometryUtils::getAngleDiff(angle1, angle2); + + // Save corner + if (distPrev > baseSampleRate * ProximityInfoParams::CORNER_CHECK_DISTANCE_THRESHOLD_SCALE + && (sumAngle >
ProximityInfoParams::CORNER_SUM_ANGLE_THRESHOLD + || angleDiff > ProximityInfoParams::CORNER_ANGLE_THRESHOLD_FOR_POINT_SCORE)) { + score += ProximityInfoParams::CORNER_SCORE; + } + return score; +} + +// Sampling touch point and pushing information to vectors. +// Returning if previous point is popped or not. +/* With doSampling set and a negative nodeCodePoint the point is scored first: a negative score pops the newest sample, and a last point whose distance to the newest sample times LAST_POINT_SKIP_DISTANCE_SCALE is below mostCommonKeyWidth is dropped without being pushed. A non-negative nodeCodePoint with a negative x or y is replaced by the center of its key when the key is found. The point is then appended to all five sample vectors, with the length cache extended by the distance to the previous sample (0 for the first sample). */ +/* static */ bool ProximityInfoStateUtils::pushTouchPoint(const ProximityInfo *const proximityInfo, + const int maxPointToKeyLength, const int inputIndex, const int nodeCodePoint, int x, int y, + const int time, const bool isGeometric, const bool doSampling, + const bool isLastPoint, const float sumAngle, + NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances, + std::vector *sampledInputXs, std::vector *sampledInputYs, + std::vector *sampledInputTimes, std::vector *sampledLengthCache, + std::vector *sampledInputIndice) { + const int mostCommonKeyWidth = proximityInfo->getMostCommonKeyWidth(); + + size_t size = sampledInputXs->size(); + bool popped = false; + if (nodeCodePoint < 0 && doSampling) { + const float nearest = updateNearKeysDistances(proximityInfo, maxPointToKeyLength, x, y, + isGeometric, currentNearKeysDistances); + const float score = getPointScore(mostCommonKeyWidth, x, y, time, isLastPoint, nearest, + sumAngle, currentNearKeysDistances, prevNearKeysDistances, + prevPrevNearKeysDistances, sampledInputXs, sampledInputYs); + if (score < 0) { + // Pop previous point because it would be useless. + popInputData(sampledInputXs, sampledInputYs, sampledInputTimes, sampledLengthCache, + sampledInputIndice); + size = sampledInputXs->size(); + popped = true; + } else { + popped = false; + } + // Check if the last point should be skipped.
+ if (isLastPoint && size > 0) { + if (GeometryUtils::getDistanceInt(x, y, sampledInputXs->back(), sampledInputYs->back()) + * ProximityInfoParams::LAST_POINT_SKIP_DISTANCE_SCALE < mostCommonKeyWidth) { + // This point is not used because it's too close to the previous point. + if (DEBUG_GEO_FULL) { + /* NOTE(review): size is a size_t but is printed with %zd; %zu is the matching conversion -- confirm before changing the format string. */ + AKLOGI("p0: size = %zd, x = %d, y = %d, lx = %d, ly = %d, dist = %d, " + "width = %d", size, x, y, sampledInputXs->back(), + sampledInputYs->back(), GeometryUtils::getDistanceInt( + x, y, sampledInputXs->back(), sampledInputYs->back()), + mostCommonKeyWidth + / ProximityInfoParams::LAST_POINT_SKIP_DISTANCE_SCALE); + } + return popped; + } + } + } + + if (nodeCodePoint >= 0 && (x < 0 || y < 0)) { + const int keyId = proximityInfo->getKeyIndexOf(nodeCodePoint); + if (keyId >= 0) { + x = proximityInfo->getKeyCenterXOfKeyIdG(keyId, NOT_AN_INDEX, isGeometric); + y = proximityInfo->getKeyCenterYOfKeyIdG(keyId, NOT_AN_INDEX, isGeometric); + } + } + + // Pushing point information. + if (size > 0) { + sampledLengthCache->push_back( + sampledLengthCache->back() + GeometryUtils::getDistanceInt( + x, y, sampledInputXs->back(), sampledInputYs->back())); + } else { + sampledLengthCache->push_back(0); + } + sampledInputXs->push_back(x); + sampledInputYs->push_back(y); + sampledInputTimes->push_back(time); + sampledInputIndice->push_back(inputIndex); + if (DEBUG_GEO_FULL) { + AKLOGI("pushTouchPoint: x = %03d, y = %03d, time = %d, index = %d, popped ?
%01d", + x, y, time, inputIndex, popped); + } + return popped; +} + +/* Returns 0.01f plus the beeline speed around sampled point id divided by averageSpeed, where the beeline speed is the straight-line distance between the raw points start and end divided by the adjusted time between them. Returns 1.0f when there are no samples, averageSpeed is below 0.001f, start >= end, or the adjusted time is not positive; returns 0.0f when the adjusted time reaches STRONG_DOUBLE_LETTER_TIME_MILLIS. */ +/* static */ float ProximityInfoStateUtils::calculateBeelineSpeedRate(const int mostCommonKeyWidth, + const float averageSpeed, const int id, const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *times, const int sampledInputSize, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledInputIndices) { + if (sampledInputSize <= 0 || averageSpeed < 0.001f) { + if (DEBUG_SAMPLING_POINTS) { + AKLOGI("--- invalid state: cancel. size = %d, ave = %f", + sampledInputSize, averageSpeed); + } + return 1.0f; + } + const int lookupRadius = mostCommonKeyWidth + * ProximityInfoParams::LOOKUP_RADIUS_PERCENTILE / MAX_PERCENTILE; + const int x0 = (*sampledInputXs)[id]; + const int y0 = (*sampledInputYs)[id]; + const int actualInputIndex = (*sampledInputIndices)[id]; + /* tempTime is accumulated in both loops below but is not read afterwards. */ + int tempTime = 0; + int tempBeelineDistance = 0; + int start = actualInputIndex; + // lookup towards earlier raw input points (start decreases) + while (start > 0 && tempBeelineDistance < lookupRadius) { + tempTime += times[start] - times[start - 1]; + --start; + tempBeelineDistance = GeometryUtils::getDistanceInt(x0, y0, xCoordinates[start], + yCoordinates[start]); + } + // Exclusive unless this is an edge point + if (start > 0 && start < actualInputIndex) { + ++start; + } + tempTime= 0; + tempBeelineDistance = 0; + int end = actualInputIndex; + // lookup towards later raw input points (end increases) + while (end < (inputSize - 1) && tempBeelineDistance < lookupRadius) { + tempTime += times[end + 1] - times[end]; + ++end; + tempBeelineDistance = GeometryUtils::getDistanceInt(x0, y0, xCoordinates[end], + yCoordinates[end]); + } + // Exclusive unless this is an edge point + if (end > actualInputIndex && end < (inputSize - 1)) { + --end; + } + + if (start >= end) { + if (DEBUG_DOUBLE_LETTER) { + AKLOGI("--- double letter: start == end %d", start); + } + return 1.0f; + } + + const int x2 = xCoordinates[start]; + const int y2 =
yCoordinates[start]; + const int x3 = xCoordinates[end]; + const int y3 = yCoordinates[end]; + const int beelineDistance = GeometryUtils::getDistanceInt(x2, y2, x3, y3); + int adjustedStartTime = times[start]; + if (start == 0 && actualInputIndex == 0 && inputSize > 1) { + adjustedStartTime += ProximityInfoParams::FIRST_POINT_TIME_OFFSET_MILLIS; + } + int adjustedEndTime = times[end]; + if (end == (inputSize - 1) && inputSize > 1) { + adjustedEndTime -= ProximityInfoParams::FIRST_POINT_TIME_OFFSET_MILLIS; + } + const int time = adjustedEndTime - adjustedStartTime; + if (time <= 0) { + return 1.0f; + } + + if (time >= ProximityInfoParams::STRONG_DOUBLE_LETTER_TIME_MILLIS){ + return 0.0f; + } + if (DEBUG_DOUBLE_LETTER) { + AKLOGI("--- (%d, %d) double letter: start = %d, end = %d, dist = %d, time = %d," + " speed = %f, ave = %f, val = %f, start time = %d, end time = %d", + id, (*sampledInputIndices)[id], start, end, beelineDistance, time, + (static_cast(beelineDistance) / static_cast(time)), averageSpeed, + ((static_cast(beelineDistance) / static_cast(time)) + / averageSpeed), adjustedStartTime, adjustedEndTime); + } + // Offset 1% + // TODO: Detect double letter more smartly + return 0.01f + static_cast(beelineDistance) / static_cast(time) / averageSpeed; +} + +/* Returns the angle difference between the direction from sampled point index - 1 to index and the direction from index to index + 1; returns 0.0f for null vectors and for the first and last sampled points. */ +/* static */ float ProximityInfoStateUtils::getPointAngle( + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int index) { + if (!sampledInputXs || !sampledInputYs) { + return 0.0f; + } + const int sampledInputSize = sampledInputXs->size(); + if (index <= 0 || index >= sampledInputSize - 1) { + return 0.0f; + } + const float previousDirection = getDirection(sampledInputXs, sampledInputYs, index - 1, index); + const float nextDirection = getDirection(sampledInputXs, sampledInputYs, index, index + 1); + const float directionDiff = GeometryUtils::getAngleDiff(previousDirection, nextDirection); + return directionDiff; +} + +/* Returns the angle difference between getDirection(index0, index1) and getDirection(index1, index2), or 0.0f when a vector is null or any of the three indices is outside the sampled range. */ +/* static */ float
ProximityInfoStateUtils::getPointsAngle( + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const int index0, const int index1, const int index2) { + if (!sampledInputXs || !sampledInputYs) { + return 0.0f; + } + const int sampledInputSize = sampledInputXs->size(); + if (index0 < 0 || index0 > sampledInputSize - 1) { + return 0.0f; + } + if (index1 < 0 || index1 > sampledInputSize - 1) { + return 0.0f; + } + if (index2 < 0 || index2 > sampledInputSize - 1) { + return 0.0f; + } + const float previousDirection = getDirection(sampledInputXs, sampledInputYs, index0, index1); + const float nextDirection = getDirection(sampledInputXs, sampledInputYs, index1, index2); + return GeometryUtils::getAngleDiff(previousDirection, nextDirection); +} + +// This function basically converts from a length to an edit distance. Accordingly, it's obviously +// wrong to compare with mMaxPointToKeyLength. +/* Returns the cached value at inputIndex * keyCount + keyId capped at maxPointToKeyLength, or MAX_VALUE_FOR_WEIGHTING when keyId is NOT_AN_INDEX. */ +/* static */ float ProximityInfoStateUtils::getPointToKeyByIdLength(const float maxPointToKeyLength, + const std::vector *const sampledNormalizedSquaredLengthCache, const int keyCount, + const int inputIndex, const int keyId) { + if (keyId != NOT_AN_INDEX) { + const int index = inputIndex * keyCount + keyId; + return std::min((*sampledNormalizedSquaredLengthCache)[index], maxPointToKeyLength); + } + // If the char is not a key on the keyboard then return the max length. + return static_cast(MAX_VALUE_FOR_WEIGHTING); +} + +// Updates probabilities of aligning to some keys and skipping. +// Word suggestion should be based on these probabilities.
+/* For every sampled point from start on, stores in (*charProbabilities)[i] the skip probability under key NOT_AN_INDEX and one probability per key id, then calls suppressCharProbabilities() on neighbouring points, and finally replaces each stored probability p by -logf(p), erasing key entries whose probability is below MIN_PROBABILITY. */ +/* static */ void ProximityInfoStateUtils::updateAlignPointProbabilities( + const float maxPointToKeyLength, const int mostCommonKeyWidth, const int keyCount, + const int start, const int sampledInputSize, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledSpeedRates, + const std::vector *const sampledLengthCache, + const std::vector *const sampledNormalizedSquaredLengthCache, + const ProximityInfo *const proximityInfo, + std::vector> *charProbabilities) { + charProbabilities->resize(sampledInputSize); + // Calculates probabilities of using a point as a correlated point with the character + // for each point. + for (int i = start; i < sampledInputSize; ++i) { + (*charProbabilities)[i].clear(); + // First, calculates skip probability. Starts from MAX_SKIP_PROBABILITY. + // Note that all values that are multiplied to this probability should be in [0.0, 1.0]; + float skipProbability = ProximityInfoParams::MAX_SKIP_PROBABILITY; + + const float currentAngle = getPointAngle(sampledInputXs, sampledInputYs, i); + const float speedRate = (*sampledSpeedRates)[i]; + + float nearestKeyDistance = static_cast(MAX_VALUE_FOR_WEIGHTING); + for (int j = 0; j < keyCount; ++j) { + const float distance = getPointToKeyByIdLength( + maxPointToKeyLength, sampledNormalizedSquaredLengthCache, keyCount, i, j); + if (distance < nearestKeyDistance) { + nearestKeyDistance = distance; + } + } + + if (i == 0) { + skipProbability *= std::min(1.0f, + nearestKeyDistance * ProximityInfoParams::NEAREST_DISTANCE_WEIGHT + + ProximityInfoParams::NEAREST_DISTANCE_BIAS); + // Promote the first point + skipProbability *= ProximityInfoParams::SKIP_FIRST_POINT_PROBABILITY; + } else if (i == sampledInputSize - 1) { + skipProbability *= std::min(1.0f, + nearestKeyDistance * ProximityInfoParams::NEAREST_DISTANCE_WEIGHT_FOR_LAST + + ProximityInfoParams::NEAREST_DISTANCE_BIAS_FOR_LAST); + // Promote the last point + skipProbability *=
ProximityInfoParams::SKIP_LAST_POINT_PROBABILITY; + } else { + // If the current speed is relatively slower than adjacent keys, we promote this point. + if ((*sampledSpeedRates)[i - 1] - ProximityInfoParams::SPEED_MARGIN > speedRate + && speedRate + < (*sampledSpeedRates)[i + 1] - ProximityInfoParams::SPEED_MARGIN) { + if (currentAngle < ProximityInfoParams::CORNER_ANGLE_THRESHOLD) { + skipProbability *= std::min(1.0f, speedRate + * ProximityInfoParams::SLOW_STRAIGHT_WEIGHT_FOR_SKIP_PROBABILITY); + } else { + // If the angle is small enough, we promote this point more. (e.g. pit vs put) + /* NOTE(review): this branch runs when currentAngle is not below CORNER_ANGLE_THRESHOLD, which reads at odds with "small enough" above -- confirm the intended wording. */ + skipProbability *= std::min(1.0f, + speedRate * ProximityInfoParams::SPEED_WEIGHT_FOR_SKIP_PROBABILITY + + ProximityInfoParams::MIN_SPEED_RATE_FOR_SKIP_PROBABILITY); + } + } + + skipProbability *= std::min(1.0f, + speedRate * nearestKeyDistance * ProximityInfoParams::NEAREST_DISTANCE_WEIGHT + + ProximityInfoParams::NEAREST_DISTANCE_BIAS); + + // Adjusts skip probability by a rate depending on angle. + // ANGLE_RATE of skipProbability is adjusted by current angle. + skipProbability *= (M_PI_F - currentAngle) / M_PI_F * ProximityInfoParams::ANGLE_WEIGHT + + (1.0f - ProximityInfoParams::ANGLE_WEIGHT); + if (currentAngle > ProximityInfoParams::DEEP_CORNER_ANGLE_THRESHOLD) { + skipProbability *= ProximityInfoParams::SKIP_DEEP_CORNER_PROBABILITY; + } + // We assume the angle of this point is the angle for point[i], point[i - 2] + // and point[i - 3]. The reason why we don't use the angle for point[i], point[i - 1] + // and point[i - 2] is this angle can be more affected by the noise.
+ const float prevAngle = getPointsAngle(sampledInputXs, sampledInputYs, i, i - 2, i - 3); + if (i >= 3 && prevAngle < ProximityInfoParams::STRAIGHT_ANGLE_THRESHOLD + && currentAngle > ProximityInfoParams::CORNER_ANGLE_THRESHOLD) { + skipProbability *= ProximityInfoParams::SKIP_CORNER_PROBABILITY; + } + } + + // probabilities must be in [0.0, ProximityInfoParams::MAX_SKIP_PROBABILITY]; + ASSERT(skipProbability >= 0.0f); + ASSERT(skipProbability <= ProximityInfoParams::MAX_SKIP_PROBABILITY); + (*charProbabilities)[i][NOT_AN_INDEX] = skipProbability; + + // Second, calculates key probabilities by dividing the rest probability + // (1.0f - skipProbability). + const float inputCharProbability = 1.0f - skipProbability; + + const float speedMultipliedByAngleRate = std::min(speedRate * currentAngle / M_PI_F + * ProximityInfoParams::SPEEDxANGLE_WEIGHT_FOR_STANDARD_DEVIATION, + ProximityInfoParams::MAX_SPEEDxANGLE_RATE_FOR_STANDARD_DEVIATION); + const float speedMultipliedByNearestKeyDistanceRate = std::min( + speedRate * nearestKeyDistance + * ProximityInfoParams::SPEEDxNEAREST_WEIGHT_FOR_STANDARD_DEVIATION, + ProximityInfoParams::MAX_SPEEDxNEAREST_RATE_FOR_STANDARD_DEVIATION); + const float sigma = (speedMultipliedByAngleRate + speedMultipliedByNearestKeyDistanceRate + + ProximityInfoParams::MIN_STANDARD_DEVIATION) * mostCommonKeyWidth; + float theta = 0.0f; + // TODO: Use different metrics to compute sigmas.
+ float sigmaX = sigma; + float sigmaY = sigma; + if (i == 0 && i != sampledInputSize - 1) { + // First point + theta = getDirection(sampledInputXs, sampledInputYs, i + 1, i); + sigmaX *= ProximityInfoParams::STANDARD_DEVIATION_X_WEIGHT_FOR_FIRST; + sigmaY *= ProximityInfoParams::STANDARD_DEVIATION_Y_WEIGHT_FOR_FIRST; + } else { + if (i == sampledInputSize - 1) { + // Last point + sigmaX *= ProximityInfoParams::STANDARD_DEVIATION_X_WEIGHT_FOR_LAST; + sigmaY *= ProximityInfoParams::STANDARD_DEVIATION_Y_WEIGHT_FOR_LAST; + } else { + sigmaX *= ProximityInfoParams::STANDARD_DEVIATION_X_WEIGHT; + sigmaY *= ProximityInfoParams::STANDARD_DEVIATION_Y_WEIGHT; + } + theta = getDirection(sampledInputXs, sampledInputYs, i, i - 1); + } + NormalDistribution2D distribution((*sampledInputXs)[i], sigmaX, (*sampledInputYs)[i], + sigmaY, theta); + // Summing up probability densities of all near keys. + float sumOfProbabilityDensities = 0.0f; + for (int j = 0; j < keyCount; ++j) { + sumOfProbabilityDensities += distribution.getProbabilityDensity( + proximityInfo->getKeyCenterXOfKeyIdG(j, + NOT_A_COORDINATE /* referencePointX */, true /* isGeometric */), + proximityInfo->getKeyCenterYOfKeyIdG(j, + NOT_A_COORDINATE /* referencePointY */, true /* isGeometric */)); + } + + // Split the probability of an input point to keys that are close to the input point.
+ /* NOTE(review): the division below has no guard for sumOfProbabilityDensities == 0.0f -- presumably some key always has a non-zero density; confirm. */ + for (int j = 0; j < keyCount; ++j) { + const float probabilityDensity = distribution.getProbabilityDensity( + proximityInfo->getKeyCenterXOfKeyIdG(j, + NOT_A_COORDINATE /* referencePointX */, true /* isGeometric */), + proximityInfo->getKeyCenterYOfKeyIdG(j, + NOT_A_COORDINATE /* referencePointY */, true /* isGeometric */)); + const float probability = inputCharProbability * probabilityDensity + / sumOfProbabilityDensities; + (*charProbabilities)[i][j] = probability; + } + } + + if (DEBUG_POINTS_PROBABILITY) { + for (int i = 0; i < sampledInputSize; ++i) { + std::stringstream sstream; + sstream << i << ", "; + sstream << "(" << (*sampledInputXs)[i] << ", " << (*sampledInputYs)[i] << "), "; + sstream << "Speed: "<< (*sampledSpeedRates)[i] << ", "; + sstream << "Angle: "<< getPointAngle(sampledInputXs, sampledInputYs, i) << ", \n"; + + for (std::unordered_map::iterator it = (*charProbabilities)[i].begin(); + it != (*charProbabilities)[i].end(); ++it) { + if (it->first == NOT_AN_INDEX) { + sstream << it->first + << "(skip):" + << it->second + << "\n"; + } else { + sstream << it->first + << "(" + //<< static_cast(mProximityInfo->getCodePointOf(it->first)) + << "):" + << it->second + << "\n"; + } + } + AKLOGI("%s", sstream.str().c_str()); + } + } + + // Decrease key probabilities of points which don't have the highest probability of that key + // among nearby points. Probabilities of the first point and the last point are not suppressed.
+ /* NOTE(review): the loop below starts at max(start, 1) but runs through sampledInputSize - 1, so the last point is also passed as the first index argument of suppressCharProbabilities() -- confirm this matches the statement about the last point. */ + for (int i = std::max(start, 1); i < sampledInputSize; ++i) { + for (int j = i + 1; j < sampledInputSize; ++j) { + if (!suppressCharProbabilities( + mostCommonKeyWidth, sampledInputSize, sampledLengthCache, i, j, + charProbabilities)) { + break; + } + } + for (int j = i - 1; j >= std::max(start, 0); --j) { + if (!suppressCharProbabilities( + mostCommonKeyWidth, sampledInputSize, sampledLengthCache, i, j, + charProbabilities)) { + break; + } + } + } + + // Converting from raw probabilities to log probabilities to calculate spatial distance. + for (int i = start; i < sampledInputSize; ++i) { + for (int j = 0; j < keyCount; ++j) { + std::unordered_map::iterator it = (*charProbabilities)[i].find(j); + if (it == (*charProbabilities)[i].end()){ + continue; + } else if(it->second < ProximityInfoParams::MIN_PROBABILITY) { + // Erases from near keys vector because it has very low probability. + (*charProbabilities)[i].erase(j); + } else { + it->second = -logf(it->second); + } + } + (*charProbabilities)[i][NOT_AN_INDEX] = -logf((*charProbabilities)[i][NOT_AN_INDEX]); + } +} + +/* For every sampled point i, sets in (*sampledSearchKeySets)[i] the key ids that have an entry in charProbabilities at any point j >= max(i, lastSavedInputSize) whose cached length is less than readForwordLength beyond that of point i (sets of points at or after lastSavedInputSize are reset first), then rebuilds (*sampledSearchKeyVectors)[i] as the list of distinct code points of the keys in that set. */ +/* static */ void ProximityInfoStateUtils::updateSampledSearchKeySets( + const ProximityInfo *const proximityInfo, const int sampledInputSize, + const int lastSavedInputSize, const std::vector *const sampledLengthCache, + const std::vector> *const charProbabilities, + std::vector *sampledSearchKeySets, + std::vector> *sampledSearchKeyVectors) { + sampledSearchKeySets->resize(sampledInputSize); + sampledSearchKeyVectors->resize(sampledInputSize); + const int readForwordLength = static_cast( + hypotf(proximityInfo->getKeyboardWidth(), proximityInfo->getKeyboardHeight()) + * ProximityInfoParams::SEARCH_KEY_RADIUS_RATIO); + for (int i = 0; i < sampledInputSize; ++i) { + if (i >= lastSavedInputSize) { + (*sampledSearchKeySets)[i].reset(); + } + for (int j = std::max(i, lastSavedInputSize); j < sampledInputSize; ++j) { + // TODO: Investigate if this is required. This may not fail.
+ if ((*sampledLengthCache)[j] - (*sampledLengthCache)[i] >= readForwordLength) { + break; + } + for(const auto& charProbability : charProbabilities->at(j)) { + if (charProbability.first == NOT_AN_INDEX) { + continue; + } + (*sampledSearchKeySets)[i].set(charProbability.first); + } + } + } + const int keyCount = proximityInfo->getKeyCount(); + for (int i = 0; i < sampledInputSize; ++i) { + std::vector *searchKeyVector = &(*sampledSearchKeyVectors)[i]; + searchKeyVector->clear(); + for (int j = 0; j < keyCount; ++j) { + if ((*sampledSearchKeySets)[i].test(j)) { + const int keyCodePoint = proximityInfo->getCodePointOf(j); + if (std::find(searchKeyVector->begin(), searchKeyVector->end(), keyCodePoint) + == searchKeyVector->end()) { + searchKeyVector->push_back(keyCodePoint); + } + } + } + } +} + +// Decreases char probabilities of index0 by checking probabilities of a near point (index1) and +// increases char probabilities of index1 by checking probabilities of index0. +/* static */ bool ProximityInfoStateUtils::suppressCharProbabilities(const int mostCommonKeyWidth, + const int sampledInputSize, const std::vector *const lengthCache, + const int index0, const int index1, + std::vector> *charProbabilities) { + ASSERT(0 <= index0 && index0 < sampledInputSize); + ASSERT(0 <= index1 && index1 < sampledInputSize); + const float keyWidthFloat = static_cast(mostCommonKeyWidth); + const float diff = fabsf(static_cast((*lengthCache)[index0] - (*lengthCache)[index1])); + if (diff > keyWidthFloat * ProximityInfoParams::SUPPRESSION_LENGTH_WEIGHT) { + return false; + } + const float suppressionRate = ProximityInfoParams::MIN_SUPPRESSION_RATE + + diff / keyWidthFloat / ProximityInfoParams::SUPPRESSION_LENGTH_WEIGHT + * ProximityInfoParams::SUPPRESSION_WEIGHT; + for (std::unordered_map::iterator it = (*charProbabilities)[index0].begin(); + it != (*charProbabilities)[index0].end(); ++it) { + std::unordered_map::iterator it2 = (*charProbabilities)[index1].find(it->first); + if (it2
!= (*charProbabilities)[index1].end() && it->second < it2->second) { + const float newProbability = it->second * suppressionRate; + const float suppression = it->second - newProbability; + it->second = newProbability; + // mCharProbabilities[index0][NOT_AN_INDEX] is the probability of skipping this point. + (*charProbabilities)[index0][NOT_AN_INDEX] += suppression; + + // Add the probability of the same key nearby index1 + const float probabilityGain = std::min(suppression + * ProximityInfoParams::SUPPRESSION_WEIGHT_FOR_PROBABILITY_GAIN, + (*charProbabilities)[index1][NOT_AN_INDEX] + * ProximityInfoParams::SKIP_PROBABALITY_WEIGHT_FOR_PROBABILITY_GAIN); + it2->second += probabilityGain; + (*charProbabilities)[index1][NOT_AN_INDEX] -= probabilityGain; + } + } + return true; +} + +/* static */ bool ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible( + const int inputSize, const int *const xCoordinates, const int *const yCoordinates, + const int *const times, const int sampledInputSize, + const std::vector *const sampledInputXs, const std::vector *const sampledInputYs, + const std::vector *const sampledTimes, + const std::vector *const sampledInputIndices) { + if (inputSize < sampledInputSize) { + return false; + } + for (int i = 0; i < sampledInputSize; ++i) { + const int index = (*sampledInputIndices)[i]; + if (index >= inputSize) { + return false; + } + if (xCoordinates[index] != (*sampledInputXs)[i] + || yCoordinates[index] != (*sampledInputYs)[i]) { + return false; + } + if (!times) { + continue; + } + if (times[index] != (*sampledTimes)[i]) { + return false; + } + } + return true; +} + +// Get a word that is detected by tracing the most probable string into codePointBuf and +// returns probability of generating the word. 
+/* static */ float ProximityInfoStateUtils::getMostProbableString( + const ProximityInfo *const proximityInfo, const int sampledInputSize, + const std::vector> *const charProbabilities, + int *const codePointBuf) { + ASSERT(sampledInputSize >= 0); + memset(codePointBuf, 0, sizeof(codePointBuf[0]) * MAX_WORD_LENGTH); + int index = 0; + float sumLogProbability = 0.0f; + // TODO: Current implementation is greedy algorithm. DP would be efficient for many cases. + for (int i = 0; i < sampledInputSize && index < MAX_WORD_LENGTH - 1; ++i) { + float minLogProbability = static_cast(MAX_VALUE_FOR_WEIGHTING); + int character = NOT_AN_INDEX; + for (std::unordered_map::const_iterator it = (*charProbabilities)[i].begin(); + it != (*charProbabilities)[i].end(); ++it) { + const float logProbability = (it->first != NOT_AN_INDEX) + ? it->second + ProximityInfoParams::DEMOTION_LOG_PROBABILITY : it->second; + if (logProbability < minLogProbability) { + minLogProbability = logProbability; + character = it->first; + } + } + if (character != NOT_AN_INDEX) { + const int codePoint = proximityInfo->getCodePointOf(character); + if (codePoint == NOT_A_CODE_POINT) { + AKLOGE("Key index(%d) is not found. Cannot construct most probable string", + character); + ASSERT(false); + // Make the length zero, which means most probable string won't be used. 
+ index = 0; + break; + } + codePointBuf[index] = codePoint; + index++; + } + sumLogProbability += minLogProbability; + } + codePointBuf[index] = '\0'; + return sumLogProbability; +} + +/* static */ void ProximityInfoStateUtils::dump(const bool isGeometric, const int inputSize, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int sampledInputSize, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledTimes, + const std::vector *const sampledSpeedRates, + const std::vector *const sampledBeelineSpeedPercentiles) { + if (DEBUG_GEO_FULL) { + for (int i = 0; i < sampledInputSize; ++i) { + AKLOGI("Sampled(%d): x = %d, y = %d, time = %d", i, (*sampledInputXs)[i], + (*sampledInputYs)[i], sampledTimes ? (*sampledTimes)[i] : -1); + } + } + + std::stringstream originalX, originalY, sampledX, sampledY; + for (int i = 0; i < inputSize; ++i) { + originalX << inputXCoordinates[i]; + originalY << inputYCoordinates[i]; + if (i != inputSize - 1) { + originalX << ";"; + originalY << ";"; + } + } + AKLOGI("===== sampled points ====="); + for (int i = 0; i < sampledInputSize; ++i) { + if (isGeometric) { + AKLOGI("%d: x = %d, y = %d, time = %d, relative speed = %.4f, beeline speed = %d", + i, (*sampledInputXs)[i], (*sampledInputYs)[i], (*sampledTimes)[i], + (*sampledSpeedRates)[i], (*sampledBeelineSpeedPercentiles)[i]); + } + sampledX << (*sampledInputXs)[i]; + sampledY << (*sampledInputYs)[i]; + if (i != sampledInputSize - 1) { + sampledX << ";"; + sampledY << ";"; + } + } + AKLOGI("original points:\n%s, %s,\nsampled points:\n%s, %s,\n", + originalX.str().c_str(), originalY.str().c_str(), sampledX.str().c_str(), + sampledY.str().c_str()); +} +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_state_utils.h b/app/src/main/jni/src/suggest/core/layout/proximity_info_state_utils.h new file mode 100644 index 000000000..4043334e6 --- /dev/null +++ 
b/app/src/main/jni/src/suggest/core/layout/proximity_info_state_utils.h @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROXIMITY_INFO_STATE_UTILS_H +#define LATINIME_PROXIMITY_INFO_STATE_UTILS_H + +#include +#include +#include + +#include "defines.h" + +namespace latinime { +class ProximityInfo; +class ProximityInfoParams; + +class ProximityInfoStateUtils { + public: + typedef std::unordered_map NearKeysDistanceMap; + typedef std::bitset NearKeycodesSet; + + static int trimLastTwoTouchPoints(std::vector *sampledInputXs, + std::vector *sampledInputYs, std::vector *sampledInputTimes, + std::vector *sampledLengthCache, std::vector *sampledInputIndice); + static int updateTouchPoints(const ProximityInfo *const proximityInfo, + const int maxPointToKeyLength, const int *const inputProximities, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int *const times, const int *const pointerIds, const int inputSize, + const bool isGeometric, const int pointerId, const int pushTouchPointStartIndex, + std::vector *sampledInputXs, std::vector *sampledInputYs, + std::vector *sampledInputTimes, std::vector *sampledLengthCache, + std::vector *sampledInputIndice); + static const int *getProximityCodePointsAt(const int *const inputProximities, const int index); + static int getPrimaryCodePointAt(const int *const inputProximities, const int 
index); + static void popInputData(std::vector *sampledInputXs, std::vector *sampledInputYs, + std::vector *sampledInputTimes, std::vector *sampledLengthCache, + std::vector *sampledInputIndice); + static float refreshSpeedRates(const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *const times, const int lastSavedInputSize, + const int sampledInputSize, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledInputTimes, + const std::vector *const sampledLengthCache, + const std::vector *const sampledInputIndice, + std::vector *sampledSpeedRates, std::vector *sampledDirections); + static void refreshBeelineSpeedRates(const int mostCommonKeyWidth, const float averageSpeed, + const int inputSize, const int *const xCoordinates, const int *const yCoordinates, + const int *times, const int sampledInputSize, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const std::vector *const inputIndice, + std::vector *beelineSpeedPercentiles); + static float getDirection(const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int index0, const int index1); + static void updateAlignPointProbabilities(const float maxPointToKeyLength, + const int mostCommonKeyWidth, const int keyCount, const int start, + const int sampledInputSize, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledSpeedRates, + const std::vector *const sampledLengthCache, + const std::vector *const sampledNormalizedSquaredLengthCache, + const ProximityInfo *const proximityInfo, + std::vector> *charProbabilities); + static void updateSampledSearchKeySets(const ProximityInfo *const proximityInfo, + const int sampledInputSize, const int lastSavedInputSize, + const std::vector *const sampledLengthCache, + const std::vector> *const charProbabilities, + std::vector 
*sampledSearchKeySets, + std::vector> *sampledSearchKeyVectors); + static float getPointToKeyByIdLength(const float maxPointToKeyLength, + const std::vector *const sampledNormalizedSquaredLengthCache, const int keyCount, + const int inputIndex, const int keyId); + static void initGeometricDistanceInfos(const ProximityInfo *const proximityInfo, + const int sampledInputSize, const int lastSavedInputSize, const bool isGeometric, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + std::vector *sampledNormalizedSquaredLengthCache); + static void initPrimaryInputWord(const int inputSize, const int *const inputProximities, + int *primaryInputWord); + static void dump(const bool isGeometric, const int inputSize, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int sampledInputSize, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledTimes, + const std::vector *const sampledSpeedRates, + const std::vector *const sampledBeelineSpeedPercentiles); + static bool checkAndReturnIsContinuousSuggestionPossible(const int inputSize, + const int *const xCoordinates, const int *const yCoordinates, const int *const times, + const int sampledInputSize, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const sampledTimes, + const std::vector *const sampledInputIndices); + // TODO: Move to most_probable_string_utils.h + static float getMostProbableString(const ProximityInfo *const proximityInfo, + const int sampledInputSize, + const std::vector> *const charProbabilities, + int *const codePointBuf); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfoStateUtils); + + static float updateNearKeysDistances(const ProximityInfo *const proximityInfo, + const float maxPointToKeyLength, const int x, const int y, + const bool isGeometric, + NearKeysDistanceMap *const currentNearKeysDistances); + static 
bool isPrevLocalMin(const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances); + static float getPointScore(const int mostCommonKeyWidth, const int x, const int y, + const int time, const bool lastPoint, const float nearest, const float sumAngle, + const NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances, + std::vector *sampledInputXs, std::vector *sampledInputYs); + static bool pushTouchPoint(const ProximityInfo *const proximityInfo, + const int maxPointToKeyLength, const int inputIndex, const int nodeCodePoint, int x, + int y, const int time, const bool isGeometric, + const bool doSampling, const bool isLastPoint, + const float sumAngle, NearKeysDistanceMap *const currentNearKeysDistances, + const NearKeysDistanceMap *const prevNearKeysDistances, + const NearKeysDistanceMap *const prevPrevNearKeysDistances, + std::vector *sampledInputXs, std::vector *sampledInputYs, + std::vector *sampledInputTimes, std::vector *sampledLengthCache, + std::vector *sampledInputIndice); + static float calculateBeelineSpeedRate(const int mostCommonKeyWidth, const float averageSpeed, + const int id, const int inputSize, const int *const xCoordinates, + const int *const yCoordinates, const int *times, const int sampledInputSize, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, + const std::vector *const inputIndice); + static float getPointAngle(const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int index); + static float getPointsAngle(const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int index0, const int index1, + const int index2); + static bool suppressCharProbabilities(const int mostCommonKeyWidth, + const int sampledInputSize, 
const std::vector *const lengthCache, const int index0, + const int index1, std::vector> *charProbabilities); + static float calculateSquaredDistanceFromSweetSpotCenter( + const ProximityInfo *const proximityInfo, const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int keyIndex, + const int inputIndex); + static float calculateNormalizedSquaredDistance(const ProximityInfo *const proximityInfo, + const std::vector *const sampledInputXs, + const std::vector *const sampledInputYs, const int keyIndex, const int inputIndex); +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_STATE_UTILS_H diff --git a/app/src/main/jni/src/suggest/core/layout/proximity_info_utils.h b/app/src/main/jni/src/suggest/core/layout/proximity_info_utils.h new file mode 100644 index 000000000..79d0615b8 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/proximity_info_utils.h @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_PROXIMITY_INFO_UTILS_H +#define LATINIME_PROXIMITY_INFO_UTILS_H + +#include +#include +#include + +#include "defines.h" +#include "suggest/core/layout/additional_proximity_chars.h" +#include "suggest/core/layout/geometry_utils.h" +#include "utils/char_utils.h" + +namespace latinime { +class ProximityInfoUtils { + public: + static AK_FORCE_INLINE int getKeyIndexOf(const int keyCount, const int c, + const std::unordered_map *const codeToKeyMap) { + if (keyCount == 0) { + // We do not have the coordinate data + return NOT_AN_INDEX; + } + if (c == NOT_A_CODE_POINT) { + return NOT_AN_INDEX; + } + const int lowerCode = CharUtils::toLowerCase(c); + std::unordered_map::const_iterator mapPos = codeToKeyMap->find(lowerCode); + if (mapPos != codeToKeyMap->end()) { + return mapPos->second; + } + return NOT_AN_INDEX; + } + + static AK_FORCE_INLINE void initializeProximities(const int *const inputCodes, + const int *const inputXCoordinates, const int *const inputYCoordinates, + const int inputSize, const int *const keyXCoordinates, + const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, + const int *const proximityCharsArray, const int cellHeight, const int cellWidth, + const int gridWidth, const int mostCommonKeyWidth, const int keyCount, + const std::vector *locale, + const std::unordered_map *const codeToKeyMap, int *inputProximities) { + // Initialize + // - mInputCodes + // - mNormalizedSquaredDistances + // TODO: Merge + for (int i = 0; i < inputSize; ++i) { + const int primaryKey = inputCodes[i]; + const int x = inputXCoordinates[i]; + const int y = inputYCoordinates[i]; + int *proximities = &inputProximities[i * MAX_PROXIMITY_CHARS_SIZE]; + calculateProximities(keyXCoordinates, keyYCoordinates, keyWidths, keyHeights, + proximityCharsArray, cellHeight, cellWidth, gridWidth, mostCommonKeyWidth, + keyCount, x, y, primaryKey, locale, codeToKeyMap, proximities); + } + + if (DEBUG_PROXIMITY_CHARS) { + for (int i = 0; i < 
inputSize; ++i) { + AKLOGI("---"); + for (int j = 0; j < MAX_PROXIMITY_CHARS_SIZE; ++j) { + int proximityChar = + inputProximities[i * MAX_PROXIMITY_CHARS_SIZE + j]; + proximityChar += 0; + AKLOGI("--- (%d)%c", i, proximityChar); + } + } + } + } + + static AK_FORCE_INLINE int getStartIndexFromCoordinates(const int x, const int y, + const int cellHeight, const int cellWidth, const int gridWidth) { + return ((y / cellHeight) * gridWidth + (x / cellWidth)) * MAX_PROXIMITY_CHARS_SIZE; + } + + static inline float getSquaredDistanceFloat(const float x1, const float y1, const float x2, + const float y2) { + return GeometryUtils::SQUARE_FLOAT(x1 - x2) + GeometryUtils::SQUARE_FLOAT(y1 - y2); + } + + static inline float pointToLineSegSquaredDistanceFloat(const float x, const float y, + const float x1, const float y1, const float x2, const float y2, const bool extend) { + const float ray1x = x - x1; + const float ray1y = y - y1; + const float ray2x = x2 - x1; + const float ray2y = y2 - y1; + + const float dotProduct = ray1x * ray2x + ray1y * ray2y; + const float lineLengthSqr = GeometryUtils::SQUARE_FLOAT(ray2x) + + GeometryUtils::SQUARE_FLOAT(ray2y); + if (lineLengthSqr <= 0.0f) { + // Return point to the point distance. 
+ return getSquaredDistanceFloat(x, y, x1, y1); + } + const float projectionLengthSqr = dotProduct / lineLengthSqr; + + float projectionX; + float projectionY; + if (!extend && projectionLengthSqr < 0.0f) { + projectionX = x1; + projectionY = y1; + } else if (!extend && projectionLengthSqr > 1.0f) { + projectionX = x2; + projectionY = y2; + } else { + projectionX = x1 + projectionLengthSqr * ray2x; + projectionY = y1 + projectionLengthSqr * ray2y; + } + return getSquaredDistanceFloat(x, y, projectionX, projectionY); + } + + static AK_FORCE_INLINE bool isMatchOrProximityChar(const ProximityType type) { + return type == MATCH_CHAR || type == PROXIMITY_CHAR || type == ADDITIONAL_PROXIMITY_CHAR; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProximityInfoUtils); + + static bool isOnKey(const int *const keyXCoordinates, const int *const keyYCoordinates, + const int *const keyWidths, const int *keyHeights, const int keyId, const int x, + const int y) { + if (keyId < 0) return true; // NOT_A_ID is -1, but return whenever < 0 just in case + const int left = keyXCoordinates[keyId]; + const int top = keyYCoordinates[keyId]; + const int right = left + keyWidths[keyId] + 1; + const int bottom = top + keyHeights[keyId]; + return left < right && top < bottom && x >= left && x < right && y >= top && y < bottom; + } + + static AK_FORCE_INLINE void calculateProximities(const int *const keyXCoordinates, + const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, + const int *const proximityCharsArray, const int cellHeight, const int cellWidth, + const int gridWidth, const int mostCommonKeyWidth, const int keyCount, + const int x, const int y, const int primaryKey, const std::vector *locale, + const std::unordered_map *const codeToKeyMap, int *proximities) { + const int mostCommonKeyWidthSquare = mostCommonKeyWidth * mostCommonKeyWidth; + int insertPos = 0; + proximities[insertPos++] = primaryKey; + if (x == NOT_A_COORDINATE || y == NOT_A_COORDINATE) { + 
for (int i = insertPos; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + proximities[i] = NOT_A_CODE_POINT; + } + return; + } + const int startIndex = getStartIndexFromCoordinates(x, y, cellHeight, cellWidth, gridWidth); + if (startIndex >= 0) { + for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + const int c = proximityCharsArray[startIndex + i]; + if (c < KEYCODE_SPACE || c == primaryKey) { + continue; + } + const int keyIndex = getKeyIndexOf(keyCount, c, codeToKeyMap); + const bool onKey = isOnKey(keyXCoordinates, keyYCoordinates, keyWidths, keyHeights, + keyIndex, x, y); + const int distance = squaredLengthToEdge(keyXCoordinates, keyYCoordinates, + keyWidths, keyHeights, keyIndex, x, y); + if (onKey || distance < mostCommonKeyWidthSquare) { + proximities[insertPos++] = c; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + ASSERT(false); + } + return; + } + } + } + const int additionalProximitySize = + AdditionalProximityChars::getAdditionalCharsSize(locale, primaryKey); + if (additionalProximitySize > 0) { + proximities[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + ASSERT(false); + } + return; + } + + const int *additionalProximityChars = + AdditionalProximityChars::getAdditionalChars(locale, primaryKey); + for (int j = 0; j < additionalProximitySize; ++j) { + const int ac = additionalProximityChars[j]; + int k = 0; + for (; k < insertPos; ++k) { + if (ac == proximities[k]) { + break; + } + } + if (k < insertPos) { + continue; + } + proximities[insertPos++] = ac; + if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { + if (DEBUG_DICT) { + ASSERT(false); + } + return; + } + } + } + } + // Add a delimiter for the proximity characters + for (int i = insertPos; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { + proximities[i] = NOT_A_CODE_POINT; + } + } + + static int squaredLengthToEdge(const int *const keyXCoordinates, + const int *const keyYCoordinates, const int *const keyWidths, const int 
*keyHeights, + const int keyId, const int x, const int y) { + // NOT_A_ID is -1, but return whenever < 0 just in case + if (keyId < 0) return MAX_VALUE_FOR_WEIGHTING; + const int left = keyXCoordinates[keyId]; + const int top = keyYCoordinates[keyId]; + const int right = left + keyWidths[keyId]; + const int bottom = top + keyHeights[keyId]; + const int edgeX = x < left ? left : (x > right ? right : x); + const int edgeY = y < top ? top : (y > bottom ? bottom : y); + const int dx = x - edgeX; + const int dy = y - edgeY; + return dx * dx + dy * dy; + } +}; +} // namespace latinime +#endif // LATINIME_PROXIMITY_INFO_UTILS_H diff --git a/app/src/main/jni/src/suggest/core/layout/touch_position_correction_utils.h b/app/src/main/jni/src/suggest/core/layout/touch_position_correction_utils.h new file mode 100644 index 000000000..14074c13d --- /dev/null +++ b/app/src/main/jni/src/suggest/core/layout/touch_position_correction_utils.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_TOUCH_POSITION_CORRECTION_UTILS_H +#define LATINIME_TOUCH_POSITION_CORRECTION_UTILS_H + +#include + +#include "defines.h" +#include "suggest/core/layout/proximity_info_params.h" + +namespace latinime { +class TouchPositionCorrectionUtils { + public: + static float getSweetSpotFactor(const bool isTouchPositionCorrectionEnabled, + const float normalizedSquaredDistance) { + // Promote or demote the score according to the distance from the sweet spot + static const float A = 0.0f; + static const float B = 0.24f; + static const float C = 1.20f; + static const float R0 = 0.0f; + static const float R1 = 0.25f; // Sweet spot + static const float R2 = 1.0f; + const float x = normalizedSquaredDistance; + if (!isTouchPositionCorrectionEnabled) { + return std::min(C, x); + } + + // factor is a piecewise linear function like: + // C -------------. + // / . + // B / . + // -/ . + // A _-^ . + // . + // R0 R1 R2 . + + if (x < R0) { + return A; + } else if (x < R1) { + return (A * (R1 - x) + B * (x - R0)) / (R1 - R0); + } else if (x < R2) { + return (B * (R2 - x) + C * (x - R1)) / (R2 - R1); + } else { + return C; + } + } + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TouchPositionCorrectionUtils); +}; +} // namespace latinime +#endif // LATINIME_TOUCH_POSITION_CORRECTION_UTILS_H diff --git a/app/src/main/jni/src/suggest/core/policy/scoring.h b/app/src/main/jni/src/suggest/core/policy/scoring.h new file mode 100644 index 000000000..b9dda83ad --- /dev/null +++ b/app/src/main/jni/src/suggest/core/policy/scoring.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SCORING_H +#define LATINIME_SCORING_H + +#include "defines.h" + +namespace latinime { + +class DicNode; +class DicTraverseSession; +class SuggestionResults; + +// This class basically tweaks suggestions and distances apart from CompoundDistance +class Scoring { + public: + virtual int calculateFinalScore(const float compoundDistance, const int inputSize, + const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, + const bool boostExactMatches, const bool hasProbabilityZero) const = 0; + virtual void getMostProbableString(const DicTraverseSession *const traverseSession, + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const = 0; + virtual float getAdjustedWeightOfLangModelVsSpatialModel( + DicTraverseSession *const traverseSession, DicNode *const terminals, + const int size) const = 0; + virtual float getDoubleLetterDemotionDistanceCost( + const DicNode *const terminalDicNode) const = 0; + virtual bool autoCorrectsToMultiWordSuggestionIfTop() const = 0; + virtual bool sameAsTyped(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + + protected: + Scoring() {} + virtual ~Scoring() {} + + private: + DISALLOW_COPY_AND_ASSIGN(Scoring); +}; +} // namespace latinime +#endif // LATINIME_SCORING_H diff --git a/app/src/main/jni/src/suggest/core/policy/suggest_policy.h b/app/src/main/jni/src/suggest/core/policy/suggest_policy.h new file mode 100644 index 000000000..5b6402c44 --- /dev/null +++ 
b/app/src/main/jni/src/suggest/core/policy/suggest_policy.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_POLICY_H +#define LATINIME_SUGGEST_POLICY_H + +#include "defines.h" + +namespace latinime { + +class Traversal; +class Scoring; +class Weighting; + +class SuggestPolicy { + public: + SuggestPolicy() {} + virtual ~SuggestPolicy() {} + virtual const Traversal *getTraversal() const = 0; + virtual const Scoring *getScoring() const = 0; + virtual const Weighting *getWeighting() const = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(SuggestPolicy); +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_POLICY_H diff --git a/app/src/main/jni/src/suggest/core/policy/traversal.h b/app/src/main/jni/src/suggest/core/policy/traversal.h new file mode 100644 index 000000000..5b6616d9a --- /dev/null +++ b/app/src/main/jni/src/suggest/core/policy/traversal.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TRAVERSAL_H +#define LATINIME_TRAVERSAL_H + +#include "defines.h" + +namespace latinime { + +class DicTraverseSession; + +class Traversal { + public: + virtual int getMaxPointerCount() const = 0; + virtual bool allowsErrorCorrections(const DicNode *const dicNode) const = 0; + virtual bool isOmission(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const DicNode *const childDicNode, + const bool allowsErrorCorrections) const = 0; + virtual bool isSpaceSubstitutionTerminal(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual bool isSpaceOmissionTerminal(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual bool shouldDepthLevelCache(const DicTraverseSession *const traverseSession) const = 0; + virtual bool shouldNodeLevelCache(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual bool canDoLookAheadCorrection(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const = 0; + virtual ProximityType getProximityType(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const DicNode *const childDicNode) const = 0; + virtual bool needsToTraverseAllUserInput() const = 0; + virtual float getMaxSpatialDistance() const = 0; + virtual int getDefaultExpandDicNodeSize() const = 0; + virtual int getMaxCacheSize(const int inputSize, const float weightForLocale) const = 0; + virtual int getTerminalCacheSize() 
const = 0; + virtual bool isPossibleOmissionChildNode(const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0; + virtual bool isGoodToTraverseNextWord(const DicNode *const dicNode, + const int probability) const = 0; + + protected: + Traversal() {} + virtual ~Traversal() {} + + private: + DISALLOW_COPY_AND_ASSIGN(Traversal); +}; +} // namespace latinime +#endif // LATINIME_TRAVERSAL_H diff --git a/app/src/main/jni/src/suggest/core/policy/weighting.cpp b/app/src/main/jni/src/suggest/core/policy/weighting.cpp new file mode 100644 index 000000000..450203d98 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/policy/weighting.cpp @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "suggest/core/policy/weighting.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_profiler.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" +#include "suggest/core/session/dic_traverse_session.h" + +namespace latinime { + +class MultiBigramMap; + +static inline void profile(const CorrectionType correctionType, DicNode *const node) { +#if DEBUG_DICT + switch (correctionType) { + case CT_OMISSION: + PROF_OMISSION(node->mProfiler); + return; + case CT_ADDITIONAL_PROXIMITY: + PROF_ADDITIONAL_PROXIMITY(node->mProfiler); + return; + case CT_SUBSTITUTION: + PROF_SUBSTITUTION(node->mProfiler); + return; + case CT_NEW_WORD_SPACE_OMISSION: + PROF_NEW_WORD(node->mProfiler); + return; + case CT_MATCH: + PROF_MATCH(node->mProfiler); + return; + case CT_COMPLETION: + PROF_COMPLETION(node->mProfiler); + return; + case CT_TERMINAL: + PROF_TERMINAL(node->mProfiler); + return; + case CT_TERMINAL_INSERTION: + PROF_TERMINAL_INSERTION(node->mProfiler); + return; + case CT_NEW_WORD_SPACE_SUBSTITUTION: + PROF_SPACE_SUBSTITUTION(node->mProfiler); + return; + case CT_INSERTION: + PROF_INSERTION(node->mProfiler); + return; + case CT_TRANSPOSITION: + PROF_TRANSPOSITION(node->mProfiler); + return; + default: + // do nothing + return; + } +#else + // do nothing +#endif +} + +/* static */ void Weighting::addCostAndForwardInputIndex(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, DicNode *const dicNode, + MultiBigramMap *const multiBigramMap) { + const int inputSize = traverseSession->getInputSize(); + DicNode_InputStateG inputStateG; + inputStateG.mNeedsToUpdateInputStateG = false; // Don't use input info by default + const float spatialCost = Weighting::getSpatialCost(weighting, correctionType, + traverseSession, parentDicNode, dicNode, 
&inputStateG); + const float languageCost = Weighting::getLanguageCost(weighting, correctionType, + traverseSession, parentDicNode, dicNode, multiBigramMap); + const ErrorTypeUtils::ErrorType errorType = weighting->getErrorType(correctionType, + traverseSession, parentDicNode, dicNode); + profile(correctionType, dicNode); + if (inputStateG.mNeedsToUpdateInputStateG) { + dicNode->updateInputIndexG(&inputStateG); + } else { + dicNode->forwardInputIndex(0, getForwardInputCount(correctionType), + (correctionType == CT_TRANSPOSITION)); + } + dicNode->addCost(spatialCost, languageCost, weighting->needsToNormalizeCompoundDistance(), + inputSize, errorType); + if (CT_NEW_WORD_SPACE_OMISSION == correctionType) { + // When we are on a terminal, we save the current distance for evaluating + // when to auto-commit partial suggestions. + dicNode->saveNormalizedCompoundDistanceAfterFirstWordIfNoneYet(); + } +} + +/* static */ float Weighting::getSpatialCost(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode, + DicNode_InputStateG *const inputStateG) { + switch(correctionType) { + case CT_OMISSION: + return weighting->getOmissionCost(parentDicNode, dicNode); + case CT_ADDITIONAL_PROXIMITY: + // only used for typing + // TODO: Quit calling getMatchedCost(). + return weighting->getAdditionalProximityCost() + + weighting->getMatchedCost(traverseSession, dicNode, inputStateG); + case CT_SUBSTITUTION: + // only used for typing + // TODO: Quit calling getMatchedCost(). 
+ return weighting->getSubstitutionCost() + + weighting->getMatchedCost(traverseSession, dicNode, inputStateG); + case CT_NEW_WORD_SPACE_OMISSION: + return weighting->getSpaceOmissionCost(traverseSession, dicNode, inputStateG); + case CT_MATCH: + return weighting->getMatchedCost(traverseSession, dicNode, inputStateG); + case CT_COMPLETION: + return weighting->getCompletionCost(traverseSession, dicNode); + case CT_TERMINAL: + return weighting->getTerminalSpatialCost(traverseSession, dicNode); + case CT_TERMINAL_INSERTION: + return weighting->getTerminalInsertionCost(traverseSession, dicNode); + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return weighting->getSpaceSubstitutionCost(traverseSession, dicNode); + case CT_INSERTION: + return weighting->getInsertionCost(traverseSession, parentDicNode, dicNode); + case CT_TRANSPOSITION: + return weighting->getTranspositionCost(traverseSession, parentDicNode, dicNode); + default: + return 0.0f; + } +} + +/* static */ float Weighting::getLanguageCost(const Weighting *const weighting, + const CorrectionType correctionType, const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode, + MultiBigramMap *const multiBigramMap) { + switch(correctionType) { + case CT_OMISSION: + return 0.0f; + case CT_SUBSTITUTION: + return 0.0f; + case CT_NEW_WORD_SPACE_OMISSION: + return weighting->getNewWordBigramLanguageCost( + traverseSession, parentDicNode, multiBigramMap); + case CT_MATCH: + return 0.0f; + case CT_COMPLETION: + return 0.0f; + case CT_TERMINAL: { + const float languageImprobability = + DicNodeUtils::getBigramNodeImprobability( + traverseSession->getDictionaryStructurePolicy(), dicNode, multiBigramMap); + return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability); + } + case CT_TERMINAL_INSERTION: + return 0.0f; + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return weighting->getNewWordBigramLanguageCost( + traverseSession, parentDicNode, 
multiBigramMap); + case CT_INSERTION: + return 0.0f; + case CT_TRANSPOSITION: + return 0.0f; + default: + return 0.0f; + } +} + +/* static */ int Weighting::getForwardInputCount(const CorrectionType correctionType) { + switch(correctionType) { + case CT_OMISSION: + return 0; + case CT_ADDITIONAL_PROXIMITY: + return 1; + case CT_SUBSTITUTION: + return 1; + case CT_NEW_WORD_SPACE_OMISSION: + return 0; + case CT_MATCH: + return 1; + case CT_COMPLETION: + return 1; + case CT_TERMINAL: + return 0; + case CT_TERMINAL_INSERTION: + return 1; + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return 1; + case CT_INSERTION: + return 2; /* look ahead + skip the current char */ + case CT_TRANSPOSITION: + return 2; /* look ahead + skip the current char */ + default: + return 0; + } +} +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/policy/weighting.h b/app/src/main/jni/src/suggest/core/policy/weighting.h new file mode 100644 index 000000000..863c4eabe --- /dev/null +++ b/app/src/main/jni/src/suggest/core/policy/weighting.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef LATINIME_WEIGHTING_H
+#define LATINIME_WEIGHTING_H
+
+#include "defines.h"
+#include "suggest/core/dictionary/error_type_utils.h"
+
+namespace latinime {
+
+class DicNode;
+class DicTraverseSession;
+struct DicNode_InputStateG;
+class MultiBigramMap;
+/**
+ * Abstract cost model. Concrete subclasses implement the protected pure-virtual hooks, one per
+ * kind of cost; callers only use the public static addCostAndForwardInputIndex(), which
+ * dispatches on the CorrectionType through the private static helpers declared at the bottom.
+ */
+class Weighting {
+ public:
+    // Computes the spatial cost, language cost and error type for correctionType, forwards the
+    // input index of dicNode and adds the costs to it (see weighting.cpp).
+    static void addCostAndForwardInputIndex(const Weighting *const weighting,
+            const CorrectionType correctionType,
+            const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, DicNode *const dicNode,
+            MultiBigramMap *const multiBigramMap);
+
+ protected:  // Hooks implemented by concrete weightings; all of them are const queries.
+    virtual float getTerminalSpatialCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const = 0;
+
+    virtual float getOmissionCost(
+            const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0;
+
+    virtual float getMatchedCost(
+            const DicTraverseSession *const traverseSession, const DicNode *const dicNode,
+            DicNode_InputStateG *inputStateG) const = 0;
+
+    virtual bool isProximityDicNode(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const = 0;
+
+    virtual float getTranspositionCost(
+            const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode,
+            const DicNode *const dicNode) const = 0;
+
+    virtual float getInsertionCost(
+            const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0;
+
+    virtual float getSpaceOmissionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode, DicNode_InputStateG *const inputStateG) const = 0;
+
+    virtual float getNewWordBigramLanguageCost(
+            const DicTraverseSession *const traverseSession, const DicNode *const dicNode,
+            MultiBigramMap *const multiBigramMap) const = 0;
+
+    virtual float getCompletionCost(
+            const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const = 0;
+
+    virtual float getTerminalInsertionCost(
+            const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const = 0;
+
+    virtual float getTerminalLanguageCost(
+            const DicTraverseSession *const traverseSession, const DicNode *const dicNode,
+            float dicNodeLanguageImprobability) const = 0;
+
+    virtual bool needsToNormalizeCompoundDistance() const = 0;
+
+    virtual float getAdditionalProximityCost() const = 0;
+
+    virtual float getSubstitutionCost() const = 0;
+
+    virtual float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession,
+            const DicNode *const dicNode) const = 0;
+
+    virtual ErrorTypeUtils::ErrorType getErrorType(const CorrectionType correctionType,
+            const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0;
+
+    Weighting() {}
+    virtual ~Weighting() {}
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(Weighting);
+// Dispatch helpers used only by addCostAndForwardInputIndex(); defined in weighting.cpp.
+    static float getSpatialCost(const Weighting *const weighting,
+            const CorrectionType correctionType, const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode,
+            DicNode_InputStateG *const inputStateG);
+    static float getLanguageCost(const Weighting *const weighting,
+            const CorrectionType correctionType, const DicTraverseSession *const traverseSession,
+            const DicNode *const parentDicNode, const DicNode *const dicNode,
+            MultiBigramMap *const multiBigramMap);
+    // TODO: Move to TypingWeighting and GestureWeighting?
+    static int getForwardInputCount(const CorrectionType correctionType);
+};
+} // namespace latinime
+#endif // LATINIME_WEIGHTING_H
diff --git a/app/src/main/jni/src/suggest/core/result/suggested_word.h b/app/src/main/jni/src/suggest/core/result/suggested_word.h
new file mode 100644
index 000000000..258a40eeb
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/result/suggested_word.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef LATINIME_SUGGESTED_WORD_H +#define LATINIME_SUGGESTED_WORD_H + +#include + +#include "defines.h" +#include "suggest/core/dictionary/dictionary.h" + +namespace latinime { + +class SuggestedWord { + public: + class Comparator { + public: + bool operator()(const SuggestedWord &left, const SuggestedWord &right) { + if (left.getScore() != right.getScore()) { + return left.getScore() > right.getScore(); + } + return left.getCodePointCount() < right.getCodePointCount(); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(Comparator); + }; + + SuggestedWord(const int *const codePoints, const int codePointCount, + const int score, const int type, const int indexToPartialCommit, + const int autoCommitFirstWordConfidence) + : mCodePoints(codePoints, codePoints + codePointCount), mScore(score), + mType(type), mIndexToPartialCommit(indexToPartialCommit), + mAutoCommitFirstWordConfidence(autoCommitFirstWordConfidence) {} + + const int *getCodePoint() const { + return &mCodePoints.at(0); + } + + int getCodePointCount() const { + return mCodePoints.size(); + } + + int getScore() const { + return mScore; + } + + int getType() const { + return mType; + } + + int getIndexToPartialCommit() const { + return mIndexToPartialCommit; + } + + int getAutoCommitFirstWordConfidence() const { + return mAutoCommitFirstWordConfidence; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(SuggestedWord); + + std::vector mCodePoints; + int mScore; + int mType; + int mIndexToPartialCommit; + int mAutoCommitFirstWordConfidence; +}; +} // namespace latinime +#endif /* LATINIME_SUGGESTED_WORD_H */ diff --git a/app/src/main/jni/src/suggest/core/result/suggestion_results.cpp b/app/src/main/jni/src/suggest/core/result/suggestion_results.cpp new file mode 100644 index 000000000..3756d1092 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/result/suggestion_results.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/result/suggestion_results.h" + +#include "utils/jni_data_utils.h" + +namespace latinime { + +void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCount, + jintArray outputCodePointsArray, jintArray outScoresArray, jintArray outSpaceIndicesArray, + jintArray outTypesArray, jintArray outAutoCommitFirstWordConfidenceArray, + jfloatArray outWeightOfLangModelVsSpatialModel) { + int outputIndex = 0; + while (!mSuggestedWords.empty()) { + const SuggestedWord &suggestedWord = mSuggestedWords.top(); + suggestedWord.getCodePointCount(); + const int start = outputIndex * MAX_WORD_LENGTH; + JniDataUtils::outputCodePoints(env, outputCodePointsArray, start, + MAX_WORD_LENGTH /* maxLength */, suggestedWord.getCodePoint(), + suggestedWord.getCodePointCount(), true /* needsNullTermination */); + JniDataUtils::putIntToArray(env, outScoresArray, outputIndex, suggestedWord.getScore()); + JniDataUtils::putIntToArray(env, outSpaceIndicesArray, outputIndex, + suggestedWord.getIndexToPartialCommit()); + JniDataUtils::putIntToArray(env, outTypesArray, outputIndex, suggestedWord.getType()); + if (mSuggestedWords.size() == 1) { + JniDataUtils::putIntToArray(env, outAutoCommitFirstWordConfidenceArray, 0 /* index */, + suggestedWord.getAutoCommitFirstWordConfidence()); + } + ++outputIndex; + mSuggestedWords.pop(); + } + JniDataUtils::putIntToArray(env, outSuggestionCount, 0 /* index */, outputIndex); + JniDataUtils::putFloatToArray(env, 
outWeightOfLangModelVsSpatialModel, 0 /* index */, + mWeightOfLangModelVsSpatialModel); +} + +void SuggestionResults::addPrediction(const int *const codePoints, const int codePointCount, + const int probability) { + if (probability == NOT_A_PROBABILITY) { + // Invalid word. + return; + } + addSuggestion(codePoints, codePointCount, probability, Dictionary::KIND_PREDICTION, + NOT_AN_INDEX, NOT_A_FIRST_WORD_CONFIDENCE); +} + +void SuggestionResults::addSuggestion(const int *const codePoints, const int codePointCount, + const int score, const int type, const int indexToPartialCommit, + const int autocimmitFirstWordConfindence) { + if (codePointCount <= 0 || codePointCount > MAX_WORD_LENGTH) { + // Invalid word. + AKLOGE("Invalid word is added to the suggestion results. codePointCount: %d", + codePointCount); + return; + } + if (getSuggestionCount() >= mMaxSuggestionCount) { + const SuggestedWord &mWorstSuggestion = mSuggestedWords.top(); + if (score > mWorstSuggestion.getScore() || (score == mWorstSuggestion.getScore() + && codePointCount < mWorstSuggestion.getCodePointCount())) { + mSuggestedWords.pop(); + } else { + return; + } + } + mSuggestedWords.push(SuggestedWord(codePoints, codePointCount, score, type, + indexToPartialCommit, autocimmitFirstWordConfindence)); +} + +void SuggestionResults::getSortedScores(int *const outScores) const { + auto copyOfSuggestedWords = mSuggestedWords; + while (!copyOfSuggestedWords.empty()) { + const SuggestedWord &suggestedWord = copyOfSuggestedWords.top(); + outScores[copyOfSuggestedWords.size() - 1] = suggestedWord.getScore(); + copyOfSuggestedWords.pop(); + } +} + +void SuggestionResults::dumpSuggestions() const { + AKLOGE("weight of language model vs spatial model: %f", mWeightOfLangModelVsSpatialModel); + std::vector suggestedWords; + auto copyOfSuggestedWords = mSuggestedWords; + while (!copyOfSuggestedWords.empty()) { + suggestedWords.push_back(copyOfSuggestedWords.top()); + copyOfSuggestedWords.pop(); + } + int index = 0; + 
for (auto it = suggestedWords.rbegin(); it != suggestedWords.rend(); ++it) { + DUMP_SUGGESTION(it->getCodePoint(), it->getCodePointCount(), index, it->getScore()); + index++; + } +} + +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/result/suggestion_results.h b/app/src/main/jni/src/suggest/core/result/suggestion_results.h new file mode 100644 index 000000000..738c78a9f --- /dev/null +++ b/app/src/main/jni/src/suggest/core/result/suggestion_results.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGESTION_RESULTS_H +#define LATINIME_SUGGESTION_RESULTS_H + +#include +#include + +#include "defines.h" +#include "jni.h" +#include "suggest/core/result/suggested_word.h" + +namespace latinime { + +class SuggestionResults { + public: + explicit SuggestionResults(const int maxSuggestionCount) + : mMaxSuggestionCount(maxSuggestionCount), + mWeightOfLangModelVsSpatialModel(NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL), + mSuggestedWords() {} + + // Returns suggestion count. 
+ void outputSuggestions(JNIEnv *env, jintArray outSuggestionCount, jintArray outCodePointsArray, + jintArray outScoresArray, jintArray outSpaceIndicesArray, jintArray outTypesArray, + jintArray outAutoCommitFirstWordConfidenceArray, + jfloatArray outWeightOfLangModelVsSpatialModel); + void addPrediction(const int *const codePoints, const int codePointCount, const int score); + void addSuggestion(const int *const codePoints, const int codePointCount, + const int score, const int type, const int indexToPartialCommit, + const int autocimmitFirstWordConfindence); + void getSortedScores(int *const outScores) const; + void dumpSuggestions() const; + + void setWeightOfLangModelVsSpatialModel(const float weightOfLangModelVsSpatialModel) { + mWeightOfLangModelVsSpatialModel = weightOfLangModelVsSpatialModel; + } + + int getSuggestionCount() const { + return mSuggestedWords.size(); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestionResults); + + const int mMaxSuggestionCount; + float mWeightOfLangModelVsSpatialModel; + std::priority_queue< + SuggestedWord, std::vector, SuggestedWord::Comparator> mSuggestedWords; +}; +} // namespace latinime +#endif // LATINIME_SUGGESTION_RESULTS_H diff --git a/app/src/main/jni/src/suggest/core/result/suggestions_output_utils.cpp b/app/src/main/jni/src/suggest/core/result/suggestions_output_utils.cpp new file mode 100644 index 000000000..7c37241de --- /dev/null +++ b/app/src/main/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/result/suggestions_output_utils.h" + +#include +#include + +#include "dictionary/utils/binary_dictionary_shortcut_iterator.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" +#include "suggest/core/policy/scoring.h" +#include "suggest/core/result/suggestion_results.h" +#include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/suggest_options.h" + +namespace latinime { + +const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; + +/* static */ void SuggestionsOutputUtils::outputSuggestions( + const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) { +#if DEBUG_EVALUATE_MOST_PROBABLE_STRING + const int terminalSize = 0; +#else + const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize(); +#endif + std::vector terminals(terminalSize); + for (int index = terminalSize - 1; index >= 0; --index) { + traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); + } + // Compute a weight of language model when an invalid weight is passed. + // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value. + const float weightOfLangModelVsSpatialModelToOutputSuggestions = + (weightOfLangModelVsSpatialModel < 0.0f) + ? 
scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession, + terminals.data(), terminalSize) + : weightOfLangModelVsSpatialModel; + outSuggestionResults->setWeightOfLangModelVsSpatialModel( + weightOfLangModelVsSpatialModelToOutputSuggestions); + // Force autocorrection for obvious long multi-word suggestions when the top suggestion is + // a long multiple words suggestion. + // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. + const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() + && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT + && !terminals.empty() && terminals.front().hasMultipleWords()); + // TODO: have partial commit work even with multiple pointers. + const bool outputSecondWordFirstLetterInputIndex = + traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); + const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()-> + getHeaderStructurePolicy()->shouldBoostExactMatches(); + + // Output suggestion results here + for (auto &terminalDicNode : terminals) { + outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode, + weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches, + forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults); + } + scoringPolicy->getMostProbableString(traverseSession, + weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults); +} + +/* static */ bool SuggestionsOutputUtils::shouldBlockWord( + const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode, + const WordAttributes wordAttributes, const bool isLastWord) { + const bool currentWordExactMatch = + ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + // When we have to block offensive words, non-exact matched offensive words should not be + // output. 
+ const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords(); + + const bool isBlockedOffensiveWord = shouldBlockOffensiveWords && + wordAttributes.isPossiblyOffensive(); + + // This function is called in two situations: + // + // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode + // of the search, and isLastWord will be true. + // "fuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true) + // In this case, if the current word is an exact match, we will always let the word + // through, even if the user is blocking offensive words (it's exactly what they typed!) + // + // 2) In the middle of the search, when we hit a terminal node, to decide whether or not + // to start a new search at root, to try to match the rest of the input. In this case, + // terminalDicNode will point to the terminal node we just hit, and isLastWord will be + // false. + // "fuckvthis" + // | + // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true) + // + // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this" + // when offensive words are blocked would be a bad idea). + // + // In the case of a multi-word correction where the offensive word is typed last (eg. + // for the input "allfuck"), this function will be called with isLastWord==true, but + // currentWordExactMatch==false. So we are OK in this case as well. 
+ // "allfuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false) + if (isLastWord && currentWordExactMatch) { + return false; + } else { + return isBlockedOffensiveWord; + } +} + +/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( + const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, + const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel, + const bool boostExactMatches, const bool forceCommitMultiWords, + const bool outputSecondWordFirstLetterInputIndex, + SuggestionResults *const outSuggestionResults) { + if (DEBUG_GEO_FULL) { + terminalDicNode->dump("OUT:"); + } + const float doubleLetterCost = + scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); + const float compoundDistance = + terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel) + + doubleLetterCost; + const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy() + ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(), + terminalDicNode->getWordId(), nullptr /* multiBigramMap */); + const bool isExactMatch = + ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + const bool isExactMatchWithIntentionalOmission = + ErrorTypeUtils::isExactMatchWithIntentionalOmission( + terminalDicNode->getContainedErrorTypes()); + // TODO: Decide whether the word should be auto-corrected or not here. + const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent( + terminalDicNode->getContainedErrorTypes()); + const int outputTypeFlags = + (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) + | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + | (isExactMatchWithIntentionalOmission ? + Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0) + | (isAppropriateForAutoCorrection ? 
+ Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0); + // Entries that are blacklisted or do not represent a word should not be output. + const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()); + + const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(), + terminalDicNode, wordAttributes, true /* isLastWord */); + + // Increase output score of top typing suggestion to ensure autocorrection. + // TODO: Better integration with java side autocorrection logic. + const int finalScore = scoringPolicy->calculateFinalScore( + compoundDistance, traverseSession->getInputSize(), + terminalDicNode->getContainedErrorTypes(), + (forceCommitMultiWords && terminalDicNode->hasMultipleWords()), + boostExactMatches, wordAttributes.getProbability() == 0); + + // Don't output invalid or blocked offensive words. However, we still need to submit their + // shortcuts if any. + if (isValidWord && !shouldBlockThisWord) { + int codePoints[MAX_WORD_LENGTH]; + terminalDicNode->outputResult(codePoints); + const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? + terminalDicNode->getSecondWordFirstInputIndex( + traverseSession->getProximityInfoState(0)) : + NOT_AN_INDEX; + outSuggestionResults->addSuggestion(codePoints, + terminalDicNode->getTotalNodeCodePointCount(), + finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags, + indexToPartialCommit, computeFirstWordConfidence(terminalDicNode)); + } + + // Output shortcuts. + // Shortcut is not supported for multiple words suggestions. + // TODO: Check shortcuts during traversal for multiple words suggestions. 
+    if (!terminalDicNode->hasMultipleWords()) {
+        BinaryDictionaryShortcutIterator shortcutIt =
+                traverseSession->getDictionaryStructurePolicy()->getShortcutIterator(
+                        terminalDicNode->getWordId());
+        const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
+        outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
+    }
+}
+
+/**
+ * Computes the auto-commit confidence of the first word of a multi-word suggestion.
+ *
+ * Returns NOT_A_FIRST_WORD_CONFIDENCE when the suggestion contains no space. Otherwise returns
+ * the sum of three integer contributions derived from the normalized compound distance after
+ * the first word, the total code point count and the total space count of terminalDicNode.
+ */
+/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
+        const DicNode *const terminalDicNode) {
+    // Get the number of spaces in the first suggestion
+    const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
+    // Get the number of characters in the first suggestion
+    const int length = terminalDicNode->getTotalNodeCodePointCount();
+    // Get the distance for the first word of the suggestion
+    const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
+
+    // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
+    // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
+    // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
+    // we are very confident.
+    // Expected space count is 1 ~ 5
+    static const int MIN_EXPECTED_SPACE_COUNT = 1;
+    static const int MAX_EXPECTED_SPACE_COUNT = 5;
+    // Expected length is about 4 ~ 30
+    static const int MIN_EXPECTED_LENGTH = 4;
+    static const int MAX_EXPECTED_LENGTH = 30;
+    // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
+    static const float MIN_EXPECTED_DISTANCE = 0.0;
+    static const float MAX_EXPECTED_DISTANCE = 2.0;
+    // This is not strict: it's where most stuff will be falling, but it's still fine if it's
+    // outside these values. We want to output a value that reflects all of these. Each factor
+    // contributes a bit.
+
+    // We need at least a space.
+    if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
+
+    // The smaller the edit distance, the higher the contribution. MAX_EXPECTED_DISTANCE means 0
+    // contribution, while MIN_EXPECTED_DISTANCE means full contribution according to the
+    // weight of the distance. Clamp to avoid overflows.
+    const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
+            : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
+    const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
+            * (MAX_EXPECTED_DISTANCE - clampedDistance)
+            / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
+    // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
+    // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
+    // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
+    const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
+            * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
+    // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
+    // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
+    // weight of the space count.
+    const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
+            * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
+            / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
+
+    return distanceContribution + lengthContribution + spaceContribution;
+}
+
+/**
+ * Adds every shortcut target yielded by shortcutIt to outSuggestionResults.
+ *
+ * A whitelisted target whose base word is the same as typed is output as KIND_WHITELIST with a
+ * score starting from S_INT_MAX; every other target is output as KIND_SHORTCUT with a score
+ * derived from finalScore.
+ */
+/* static */ void SuggestionsOutputUtils::outputShortcuts(
+        BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
+        const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
+    int shortcutTarget[MAX_WORD_LENGTH];
+    while (shortcutIt->hasNextShortcutTarget()) {
+        bool isWhilelist;
+        int shortcutTargetStringLength;
+        shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
+                &shortcutTargetStringLength, &isWhilelist);
+        int shortcutScore;
+        int kind;
+        if (isWhilelist && sameAsTyped) {
+            shortcutScore = S_INT_MAX;
+            kind = Dictionary::KIND_WHITELIST;
+        } else {
+            // shortcut entry's score == its base entry's score - 1
+            shortcutScore = finalScore;
+            // Protection against int underflow
+            shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
+            kind = Dictionary::KIND_SHORTCUT;
+        }
+        // NOTE(review): the score is clamped and decremented a second time here, so shortcuts
+        // are output at finalScore - 2 and whitelist entries at S_INT_MAX - 1, which does not
+        // match the "base entry's score - 1" comment above -- confirm whether this is intended.
+        outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
+                std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
+                NOT_A_FIRST_WORD_CONFIDENCE);
+    }
+}
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/result/suggestions_output_utils.h b/app/src/main/jni/src/suggest/core/result/suggestions_output_utils.h
new file mode 100644
index 000000000..bcb75a483
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/result/suggestions_output_utils.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_SUGGESTIONS_OUTPUT_UTILS
+#define LATINIME_SUGGESTIONS_OUTPUT_UTILS
+
+#include "defines.h"
+#include "dictionary/property/word_attributes.h"
+
+namespace latinime {
+
+class BinaryDictionaryShortcutIterator;
+class DicNode;
+class DicTraverseSession;
+class Scoring;
+class SuggestOptions;
+class SuggestionResults;
+
+// Static-only helper that turns the terminal nodes of a traverse session into suggestion results.
+class SuggestionsOutputUtils {
+ public:
+    /**
+     * Returns true if we should block the incoming word, in the context of the user's
+     * preferences to include or not include possibly offensive words
+     */
+    static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
+            const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
+            const bool isLastWord);
+    /**
+     * Outputs the final list of suggestions (i.e., terminal nodes).
+     */
+    static void outputSuggestions(const Scoring *const scoringPolicy,
+            DicTraverseSession *traverseSession, const float weightOfLangModelVsSpatialModel,
+            SuggestionResults *const outSuggestionResults);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestionsOutputUtils);
+
+    // Inputs longer than this will autocorrect if the suggestion is multi-word
+    static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT;
+
+    // Outputs the suggestion for a single terminal node, followed by its shortcuts when the
+    // node does not contain multiple words.
+    static void outputSuggestionsOfDicNode(const Scoring *const scoringPolicy,
+            DicTraverseSession *traverseSession, const DicNode *const terminalDicNode,
+            const float weightOfLangModelVsSpatialModel, const bool boostExactMatches,
+            const bool forceCommitMultiWords, const bool outputSecondWordFirstLetterInputIndex,
+            SuggestionResults *const outSuggestionResults);
+    // Adds every shortcut target provided by shortcutIt to outSuggestionResults.
+    static void outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt,
+            const int finalScore, const bool sameAsTyped,
+            SuggestionResults *const outSuggestionResults);
+    // Returns the auto-commit confidence for the first word of a multi-word suggestion, or
+    // NOT_A_FIRST_WORD_CONFIDENCE when the suggestion contains no space.
+    static int computeFirstWordConfidence(const DicNode *const terminalDicNode);
+};
+} // namespace latinime
+#endif // LATINIME_SUGGESTIONS_OUTPUT_UTILS
diff --git a/app/src/main/jni/src/suggest/core/session/dic_traverse_session.cpp b/app/src/main/jni/src/suggest/core/session/dic_traverse_session.cpp
new file mode 100644
index 000000000..d7dd5a02d
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/session/dic_traverse_session.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/session/dic_traverse_session.h"
+
+#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/property/ngram_context.h"
+#include "suggest/core/dictionary/dictionary.h"
+
+namespace latinime {
+
+// 256K bytes threshold is heuristically used to distinguish dictionaries containing many unigrams
+// (e.g. main dictionary) from small dictionaries (e.g. contacts...)
+const int DicTraverseSession::DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION =
+        256 * 1024;
+
+// Binds the session to a dictionary, suggest options and n-gram context. The multi-word cost
+// multiplier is read from the dictionary header and the previous word ids are resolved (with
+// lower-case fallback) into mPrevWordIdArray.
+void DicTraverseSession::init(const Dictionary *const dictionary,
+        const NgramContext *const ngramContext, const SuggestOptions *const suggestOptions) {
+    // mDictionary must be set first: getDictionaryStructurePolicy() dereferences it.
+    mDictionary = dictionary;
+    mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy()
+            ->getMultiWordCostMultiplier();
+    mSuggestOptions = suggestOptions;
+    mPrevWordIdCount = ngramContext->getPrevWordIds(getDictionaryStructurePolicy(),
+            &mPrevWordIdArray, true /* tryLowerCaseSearch */).size();
+}
+
+// Stores the proximity info and pointer count for this request, then initializes one proximity
+// info state per pointer from the given input arrays.
+void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
+        const int *inputCodePoints, const int inputSize, const int *const inputXs,
+        const int *const inputYs, const int *const times, const int *const pointerIds,
+        const float maxSpatialDistance, const int maxPointerCount) {
+    mProximityInfo = pInfo;
+    mMaxPointerCount = maxPointerCount;
+    initializeProximityInfoStates(inputCodePoints, inputXs, inputYs, times, pointerIds, inputSize,
+            maxSpatialDistance, maxPointerCount);
+}
+
+// Returns the structure policy of the dictionary set by init(); mDictionary is dereferenced
+// without a null check.
+const DictionaryStructureWithBufferPolicy *DicTraverseSession::getDictionaryStructurePolicy()
+        const {
+    return mDictionary->getDictionaryStructurePolicy();
+}
+
+// Resets the dic node cache with the given sizes and clears the cached bigram map.
+void DicTraverseSession::resetCache(const int thresholdForNextActiveDicNodes, const int maxWords) {
+    mDicNodesCache.reset(thresholdForNextActiveDicNodes /* nextActiveSize */,
+            maxWords /* terminalSize */);
+    mMultiBigramMap.clear();
+}
+
+// Initializes the first maxPointerCount proximity info states and sets mInputSize to the sum of
+// their sizes.
+void DicTraverseSession::initializeProximityInfoStates(const int *const inputCodePoints,
+        const int *const inputXs, const int *const inputYs, const int *const times,
+        const int *const pointerIds, const int inputSize, const float maxSpatialDistance,
+        const int maxPointerCount) {
+    ASSERT(1 <= maxPointerCount && maxPointerCount <= MAX_POINTER_COUNT_G);
+    mInputSize = 0;
+    for (int i = 0; i < maxPointerCount; ++i) {
+        mProximityInfoStates[i].initInputParams(i, maxSpatialDistance, getProximityInfo(),
+                inputCodePoints, inputSize, inputXs, inputYs, times, pointerIds,
+                // Right now the line below is trying to figure out whether this is a gesture by
+                // looking at the pointer count and assuming whatever is above the cutoff is
+                // a gesture and whatever is below is type. This is hacky and incorrect, we
+                // should pass the correct information instead.
+                maxPointerCount == MAX_POINTER_COUNT_G,
+                getDictionaryStructurePolicy()->getHeaderStructurePolicy()->getLocale());
+        mInputSize += mProximityInfoStates[i].size();
+    }
+}
+} // namespace latinime
diff --git a/app/src/main/jni/src/suggest/core/session/dic_traverse_session.h b/app/src/main/jni/src/suggest/core/session/dic_traverse_session.h
new file mode 100644
index 000000000..f5fcfddcd
--- /dev/null
+++ b/app/src/main/jni/src/suggest/core/session/dic_traverse_session.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DIC_TRAVERSE_SESSION_H +#define LATINIME_DIC_TRAVERSE_SESSION_H + +#include + +#include "defines.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "jni.h" +#include "suggest/core/dicnode/dic_nodes_cache.h" +#include "suggest/core/layout/proximity_info_state.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class Dictionary; +class DictionaryStructureWithBufferPolicy; +class NgramContext; +class ProximityInfo; +class SuggestOptions; + +class DicTraverseSession { + public: + + // A factory method for DicTraverseSession + static AK_FORCE_INLINE void *getSessionInstance(JNIEnv *env, jstring localeStr, + jlong dictSize) { + // To deal with the trade-off between accuracy and memory space, large cache is used for + // dictionaries larger that the threshold + return new DicTraverseSession(env, localeStr, + dictSize >= DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION); + } + + static AK_FORCE_INLINE void releaseSessionInstance(DicTraverseSession *traverseSession) { + delete traverseSession; + } + + AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache) + : mPrevWordIdCount(0), mProximityInfo(nullptr), mDictionary(nullptr), + mSuggestOptions(nullptr), mDicNodesCache(usesLargeCache), mMultiBigramMap(), + mInputSize(0), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) { + // NOTE: mProximityInfoStates is an array of instances. + // No need to initialize it explicitly here. 
+ } + + // Non virtual inline destructor -- never inherit this class + AK_FORCE_INLINE ~DicTraverseSession() {} + + void init(const Dictionary *dictionary, const NgramContext *const ngramContext, + const SuggestOptions *const suggestOptions); + // TODO: Remove and merge into init + void setupForGetSuggestions(const ProximityInfo *pInfo, const int *inputCodePoints, + const int inputSize, const int *const inputXs, const int *const inputYs, + const int *const times, const int *const pointerIds, const float maxSpatialDistance, + const int maxPointerCount); + void resetCache(const int thresholdForNextActiveDicNodes, const int maxWords); + + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const; + + //-------------------- + // getters and setters + //-------------------- + const ProximityInfo *getProximityInfo() const { return mProximityInfo; } + const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; } + const WordIdArrayView getPrevWordIds() const { + return WordIdArrayView::fromArray(mPrevWordIdArray).limit(mPrevWordIdCount); + } + DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; } + MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; } + const ProximityInfoState *getProximityInfoState(int id) const { + return &mProximityInfoStates[id]; + } + int getInputSize() const { return mInputSize; } + + bool isOnlyOnePointerUsed(int *pointerId) const { + // Not in the dictionary word + int usedPointerCount = 0; + int usedPointerId = 0; + for (int i = 0; i < mMaxPointerCount; ++i) { + if (mProximityInfoStates[i].isUsed()) { + ++usedPointerCount; + usedPointerId = i; + } + } + if (usedPointerCount != 1) { + return false; + } + if (pointerId) { + *pointerId = usedPointerId; + } + return true; + } + + ProximityType getProximityTypeG(const DicNode *const dicNode, const int childCodePoint) const { + ProximityType proximityType = UNRELATED_CHAR; + for (int i = 0; i < MAX_POINTER_COUNT_G; ++i) { + if 
(!mProximityInfoStates[i].isUsed()) { + continue; + } + const int pointerId = dicNode->getInputIndex(i); + proximityType = mProximityInfoStates[i].getProximityTypeG(pointerId, childCodePoint); + ASSERT(proximityType == UNRELATED_CHAR || proximityType == MATCH_CHAR); + // TODO: Make this more generic + // Currently we assume there are only two types here -- UNRELATED_CHAR + // and MATCH_CHAR + if (proximityType != UNRELATED_CHAR) { + return proximityType; + } + } + return proximityType; + } + + AK_FORCE_INLINE bool isCacheBorderForTyping(const int inputSize) const { + return mDicNodesCache.isCacheBorderForTyping(inputSize); + } + + /** + * Returns whether or not it is possible to continue suggestion from the previous search. + */ + // TODO: Remove. No need to check once the session is fully implemented. + bool isContinuousSuggestionPossible() const { + if (!mDicNodesCache.hasCachedDicNodesForContinuousSuggestion()) { + return false; + } + ASSERT(mMaxPointerCount <= MAX_POINTER_COUNT_G); + for (int i = 0; i < mMaxPointerCount; ++i) { + const ProximityInfoState *const pInfoState = getProximityInfoState(i); + // If a proximity info state is not continuous suggestion possible, + // do not continue searching. 
+ if (pInfoState->isUsed() && !pInfoState->isContinuousSuggestionPossible()) { + return false; + } + } + return true; + } + + bool isTouchPositionCorrectionEnabled() const { + return mProximityInfoStates[0].touchPositionCorrectionEnabled(); + } + + float getMultiWordCostMultiplier() const { + return mMultiWordCostMultiplier; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DicTraverseSession); + // threshold to start caching + static const int CACHE_START_INPUT_LENGTH_THRESHOLD; + static const int DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_SUGGESTION; + void initializeProximityInfoStates(const int *const inputCodePoints, const int *const inputXs, + const int *const inputYs, const int *const times, const int *const pointerIds, + const int inputSize, const float maxSpatialDistance, const int maxPointerCount); + + WordIdArray mPrevWordIdArray; + size_t mPrevWordIdCount; + const ProximityInfo *mProximityInfo; + const Dictionary *mDictionary; + const SuggestOptions *mSuggestOptions; + + DicNodesCache mDicNodesCache; + // Temporary cache for bigram frequencies + MultiBigramMap mMultiBigramMap; + ProximityInfoState mProximityInfoStates[MAX_POINTER_COUNT_G]; + + int mInputSize; + int mMaxPointerCount; + + ///////////////////////////////// + // Configuration per dictionary + float mMultiWordCostMultiplier; + +}; +} // namespace latinime +#endif // LATINIME_DIC_TRAVERSE_SESSION_H diff --git a/app/src/main/jni/src/suggest/core/suggest.cpp b/app/src/main/jni/src/suggest/core/suggest.cpp new file mode 100644 index 000000000..52fa5a5db --- /dev/null +++ b/app/src/main/jni/src/suggest/core/suggest.cpp @@ -0,0 +1,444 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/suggest.h" + +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/word_attributes.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_priority_queue.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/layout/proximity_info.h" +#include "suggest/core/policy/traversal.h" +#include "suggest/core/policy/weighting.h" +#include "suggest/core/result/suggestions_output_utils.h" +#include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/suggest_options.h" +#include "utils/profiler.h" + +namespace latinime { + +// Initialization of class constants. +const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2; + +/** + * Returns a set of suggestions for the given input touch points. The commitPoint argument indicates + * whether to prematurely commit the suggested words up to the given point for sentence-level + * suggestion. + * + * Note: Currently does not support concurrent calls across threads. Continuous suggestion is + * automatically activated for sequential calls that share the same starting input. + * TODO: Stop detecting continuous suggestion. Start using traverseSession instead. 
+ */ +void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, + int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, + int inputSize, const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const { + PROF_INIT; + PROF_TIMER_START(0); + const float maxSpatialDistance = TRAVERSAL->getMaxSpatialDistance(); + DicTraverseSession *tSession = static_cast(traverseSession); + tSession->setupForGetSuggestions(pInfo, inputCodePoints, inputSize, inputXs, inputYs, times, + pointerIds, maxSpatialDistance, TRAVERSAL->getMaxPointerCount()); + // TODO: Add the way to evaluate cache + + initializeSearch(tSession); + PROF_TIMER_END(0); + PROF_TIMER_START(1); + + // keep expanding search dicNodes until all have terminated. + while (tSession->getDicTraverseCache()->activeSize() > 0) { + expandCurrentDicNodes(tSession); + tSession->getDicTraverseCache()->advanceActiveDicNodes(); + tSession->getDicTraverseCache()->advanceInputIndex(inputSize); + } + PROF_TIMER_END(1); + PROF_TIMER_START(2); + SuggestionsOutputUtils::outputSuggestions( + SCORING, tSession, weightOfLangModelVsSpatialModel, outSuggestionResults); + PROF_TIMER_END(2); +} + +/** + * Initializes the search at the root of the lexicon trie. Note that when possible the search will + * continue suggestion from where it left off during the last call. + */ +void Suggest::initializeSearch(DicTraverseSession *traverseSession) const { + if (!traverseSession->getProximityInfoState(0)->isUsed()) { + return; + } + + if (traverseSession->getInputSize() > MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE + && traverseSession->isContinuousSuggestionPossible()) { + // Continue suggestion + traverseSession->getDicTraverseCache()->continueSearch(); + } else { + // Restart recognition at the root. 
+ traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize(), + traverseSession->getSuggestOptions()->weightForLocale()), + TRAVERSAL->getTerminalCacheSize()); + // Create a new dic node here + DicNode rootNode; + DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(), + traverseSession->getPrevWordIds(), &rootNode); + traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); + } +} + +/** + * Expands the dicNodes in the current search priority queue by advancing to the possible child + * nodes based on the next touch point(s) (or no touch points for lookahead) + */ +void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { + const int inputSize = traverseSession->getInputSize(); + DicNodeVector childDicNodes(TRAVERSAL->getDefaultExpandDicNodeSize()); + DicNode correctionDicNode; + + // TODO: Find more efficient caching + const bool shouldDepthLevelCache = TRAVERSAL->shouldDepthLevelCache(traverseSession); + if (shouldDepthLevelCache) { + traverseSession->getDicTraverseCache()->updateLastCachedInputIndex(); + } + if (DEBUG_CACHE) { + AKLOGI("expandCurrentDicNodes depth level cache = %d, inputSize = %d", + shouldDepthLevelCache, inputSize); + } + while (traverseSession->getDicTraverseCache()->activeSize() > 0) { + DicNode dicNode; + traverseSession->getDicTraverseCache()->popActive(&dicNode); + if (dicNode.isTotalInputSizeExceedingLimit()) { + return; + } + childDicNodes.clear(); + const int point0Index = dicNode.getInputIndex(0); + const bool canDoLookAheadCorrection = + TRAVERSAL->canDoLookAheadCorrection(traverseSession, &dicNode); + const bool isLookAheadCorrection = canDoLookAheadCorrection + && traverseSession->getDicTraverseCache()-> + isLookAheadCorrectionInputIndex(static_cast(point0Index)); + const bool isCompletion = dicNode.isCompletion(inputSize); + + const bool shouldNodeLevelCache = + TRAVERSAL->shouldNodeLevelCache(traverseSession, &dicNode); + if (shouldDepthLevelCache || 
shouldNodeLevelCache) { + if (DEBUG_CACHE) { + dicNode.dump("PUSH_CACHE"); + } + traverseSession->getDicTraverseCache()->copyPushContinue(&dicNode); + dicNode.setCached(); + } + + if (dicNode.isInDigraph()) { + // Finish digraph handling if the node is in the middle of a digraph expansion. + processDicNodeAsDigraph(traverseSession, &dicNode); + } else if (isLookAheadCorrection) { + // The algorithm maintains a small set of "deferred" nodes that have not consumed the + // latest touch point yet. These are needed to apply look-ahead correction operations + // that require special handling of the latest touch point. For example, with insertions + // (e.g., "thiis" -> "this") the latest touch point should not be consumed at all. + processDicNodeAsTransposition(traverseSession, &dicNode); + processDicNodeAsInsertion(traverseSession, &dicNode); + } else { // !isLookAheadCorrection + // Only consider typing error corrections if the normalized compound distance is + // below a spatial distance threshold. + // NOTE: the threshold may need to be updated if scoring model changes. + // TODO: Remove. Do not prune node here. + const bool allowsErrorCorrections = TRAVERSAL->allowsErrorCorrections(&dicNode); + // Process for handling space substitution (e.g., hevis => he is) + if (TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) { + createNextWordDicNode(traverseSession, &dicNode, true /* spaceSubstitution */); + } + + DicNodeUtils::getAllChildDicNodes( + &dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes); + + const int childDicNodesSize = childDicNodes.getSizeAndLock(); + for (int i = 0; i < childDicNodesSize; ++i) { + DicNode *const childDicNode = childDicNodes[i]; + if (isCompletion) { + // Handle forward lookahead when the lexicon letter exceeds the input size. 
+ processDicNodeAsMatch(traverseSession, childDicNode); + continue; + } + if (DigraphUtils::hasDigraphForCodePoint( + traverseSession->getDictionaryStructurePolicy() + ->getHeaderStructurePolicy(), + childDicNode->getNodeCodePoint())) { + correctionDicNode.initByCopy(childDicNode); + correctionDicNode.advanceDigraphIndex(); + processDicNodeAsDigraph(traverseSession, &correctionDicNode); + } + if (TRAVERSAL->isOmission(traverseSession, &dicNode, childDicNode, + allowsErrorCorrections)) { + // TODO: (Gesture) Change weight between omission and substitution errors + // TODO: (Gesture) Terminal node should not be handled as omission + correctionDicNode.initByCopy(childDicNode); + processDicNodeAsOmission(traverseSession, &correctionDicNode); + } + const ProximityType proximityType = TRAVERSAL->getProximityType( + traverseSession, &dicNode, childDicNode); + switch (proximityType) { + // TODO: Consider the difference of proximityType here + case MATCH_CHAR: + case PROXIMITY_CHAR: + processDicNodeAsMatch(traverseSession, childDicNode); + break; + case ADDITIONAL_PROXIMITY_CHAR: + if (allowsErrorCorrections) { + processDicNodeAsAdditionalProximityChar(traverseSession, &dicNode, + childDicNode); + } + break; + case SUBSTITUTION_CHAR: + if (allowsErrorCorrections) { + processDicNodeAsSubstitution(traverseSession, &dicNode, childDicNode); + } + break; + case UNRELATED_CHAR: + // Just drop this dicNode and do nothing. + break; + default: + // Just drop this dicNode and do nothing. 
+ break; + } + } + + // Push the dicNode for look-ahead correction + if (allowsErrorCorrections && canDoLookAheadCorrection) { + traverseSession->getDicTraverseCache()->copyPushNextActive(&dicNode); + } + } + } +} + +void Suggest::processTerminalDicNode( + DicTraverseSession *traverseSession, DicNode *dicNode) const { + if (dicNode->getCompoundDistance() >= static_cast(MAX_VALUE_FOR_WEIGHTING)) { + return; + } + if (!dicNode->isTerminalDicNode()) { + return; + } + if (dicNode->shouldBeFilteredBySafetyNetForBigram()) { + return; + } + if (!dicNode->hasMatchedOrProximityCodePoints()) { + return; + } + // Create a non-cached node here. + DicNode terminalDicNode(*dicNode); + if (TRAVERSAL->needsToTraverseAllUserInput() + && dicNode->getInputIndex(0) < traverseSession->getInputSize()) { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL_INSERTION, traverseSession, 0, + &terminalDicNode, traverseSession->getMultiBigramMap()); + } + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL, traverseSession, 0, + &terminalDicNode, traverseSession->getMultiBigramMap()); + traverseSession->getDicTraverseCache()->copyPushTerminal(&terminalDicNode); +} + +/** + * Adds the expanded dicNode to the next search priority queue. Also creates an additional next word + * (by the space omission error correction) search path if input dicNode is on a terminal. 
+ */ +void Suggest::processExpandedDicNode( + DicTraverseSession *traverseSession, DicNode *dicNode) const { + processTerminalDicNode(traverseSession, dicNode); + if (dicNode->getCompoundDistance() < static_cast(MAX_VALUE_FOR_WEIGHTING)) { + if (TRAVERSAL->isSpaceOmissionTerminal(traverseSession, dicNode)) { + createNextWordDicNode(traverseSession, dicNode, false /* spaceSubstitution */); + } + const int allowsLookAhead = !(dicNode->hasMultipleWords() + && dicNode->isCompletion(traverseSession->getInputSize())); + if (dicNode->hasChildren() && allowsLookAhead) { + traverseSession->getDicTraverseCache()->copyPushNextActive(dicNode); + } + } +} + +void Suggest::processDicNodeAsMatch(DicTraverseSession *traverseSession, + DicNode *childDicNode) const { + weightChildNode(traverseSession, childDicNode); + processExpandedDicNode(traverseSession, childDicNode); +} + +void Suggest::processDicNodeAsAdditionalProximityChar(DicTraverseSession *traverseSession, + DicNode *dicNode, DicNode *childDicNode) const { + // Note: Most types of corrections don't need to look up the bigram information since they do + // not treat the node as a terminal. There is no need to pass the bigram map in these cases. + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_ADDITIONAL_PROXIMITY, + traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */); + processExpandedDicNode(traverseSession, childDicNode); +} + +void Suggest::processDicNodeAsSubstitution(DicTraverseSession *traverseSession, + DicNode *dicNode, DicNode *childDicNode) const { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SUBSTITUTION, traverseSession, + dicNode, childDicNode, 0 /* multiBigramMap */); + processExpandedDicNode(traverseSession, childDicNode); +} + +// Process the DicNode codepoint as a digraph. This means that composite glyphs like the German +// u-umlaut is expanded to the transliteration "ue". 
Note that this happens in parallel with +// the normal non-digraph traversal, so both "uber" and "ueber" can be corrected to "[u-umlaut]ber". +void Suggest::processDicNodeAsDigraph(DicTraverseSession *traverseSession, + DicNode *childDicNode) const { + weightChildNode(traverseSession, childDicNode); + childDicNode->advanceDigraphIndex(); + processExpandedDicNode(traverseSession, childDicNode); +} + +/** + * Handle the dicNode as an omission error (e.g., ths => this). Skip the current letter and consider + * matches for all possible next letters. Note that just skipping the current letter without any + * other conditions tends to flood the search DicNodes cache with omission DicNodes. Instead, check + * the possible *next* letters after the omission to better limit search to plausible omissions. + * Note that apostrophes are handled as omissions. + */ +void Suggest::processDicNodeAsOmission( + DicTraverseSession *traverseSession, DicNode *dicNode) const { + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes( + dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes); + + const int size = childDicNodes.getSizeAndLock(); + for (int i = 0; i < size; i++) { + DicNode *const childDicNode = childDicNodes[i]; + // Treat this word as omission + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_OMISSION, traverseSession, + dicNode, childDicNode, 0 /* multiBigramMap */); + weightChildNode(traverseSession, childDicNode); + if (!TRAVERSAL->isPossibleOmissionChildNode(traverseSession, dicNode, childDicNode)) { + continue; + } + processExpandedDicNode(traverseSession, childDicNode); + } +} + +/** + * Handle the dicNode as an insertion error (e.g., thiis => this). Skip the current touch point and + * consider matches for the next touch point. 
+ */ +void Suggest::processDicNodeAsInsertion(DicTraverseSession *traverseSession, + DicNode *dicNode) const { + const int16_t pointIndex = dicNode->getInputIndex(0); + DicNodeVector childDicNodes; + DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getDictionaryStructurePolicy(), + &childDicNodes); + const int size = childDicNodes.getSizeAndLock(); + for (int i = 0; i < size; i++) { + if (traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex + 1) + != childDicNodes[i]->getNodeCodePoint()) { + continue; + } + DicNode *const childDicNode = childDicNodes[i]; + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_INSERTION, traverseSession, + dicNode, childDicNode, 0 /* multiBigramMap */); + processExpandedDicNode(traverseSession, childDicNode); + } +} + +/** + * Handle the dicNode as a transposition error (e.g., thsi => this). Swap the next two touch points. + */ +void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession, + DicNode *dicNode) const { + const int16_t pointIndex = dicNode->getInputIndex(0); + DicNodeVector childDicNodes1; + DicNodeVector childDicNodes2; + DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getDictionaryStructurePolicy(), + &childDicNodes1); + const int childSize1 = childDicNodes1.getSizeAndLock(); + for (int i = 0; i < childSize1; i++) { + const ProximityType matchedId1 = traverseSession->getProximityInfoState(0) + ->getProximityType(pointIndex + 1, childDicNodes1[i]->getNodeCodePoint(), + true /* checkProximityChars */); + if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId1)) { + continue; + } + if (childDicNodes1[i]->hasChildren()) { + childDicNodes2.clear(); + DicNodeUtils::getAllChildDicNodes(childDicNodes1[i], + traverseSession->getDictionaryStructurePolicy(), &childDicNodes2); + const int childSize2 = childDicNodes2.getSizeAndLock(); + for (int j = 0; j < childSize2; j++) { + DicNode *const childDicNode2 = childDicNodes2[j]; + const ProximityType matchedId2 =
traverseSession->getProximityInfoState(0) + ->getProximityType(pointIndex, childDicNode2->getNodeCodePoint(), + true /* checkProximityChars */); + if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId2)) { + continue; + } + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TRANSPOSITION, + traverseSession, childDicNodes1[i], childDicNode2, 0 /* multiBigramMap */); + processExpandedDicNode(traverseSession, childDicNode2); + } + } + } +} + +/** + * Weight child dicNode by aligning it to the key + */ +void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const { + const int inputSize = traverseSession->getInputSize(); + if (dicNode->isCompletion(inputSize)) { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_COMPLETION, traverseSession, + 0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */); + } else { + Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_MATCH, traverseSession, + 0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */); + } +} + +/** + * Creates a new dicNode that represents a space insertion at the end of the input dicNode. Also + * incorporates the unigram / bigram score for the ending word into the new dicNode. + */ +void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode, + const bool spaceSubstitution) const { + const WordAttributes wordAttributes = + traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext( + dicNode->getPrevWordIds(), dicNode->getWordId(), + traverseSession->getMultiBigramMap()); + if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(), + dicNode, wordAttributes, false /* isLastWord */)) { + return; + } + + if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) { + return; + } + + // Create a non-cached node here.
+ DicNode newDicNode; + DicNodeUtils::initAsRootWithPreviousWord( + traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode); + const CorrectionType correctionType = spaceSubstitution ? + CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMISSION; + Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode, + &newDicNode, traverseSession->getMultiBigramMap()); + if (newDicNode.getCompoundDistance() < static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) { + // newDicNode is worth continuing to traverse. + // CAVEAT: This pruning is important for speed. Remove this when we can afford not to prune + // here because here is not the right place to do pruning. Pruning should take place only + // in DicNodePriorityQueue. + traverseSession->getDicTraverseCache()->copyPushNextActive(&newDicNode); + } +} +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/core/suggest.h b/app/src/main/jni/src/suggest/core/suggest.h new file mode 100644 index 000000000..65d5918cf --- /dev/null +++ b/app/src/main/jni/src/suggest/core/suggest.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef LATINIME_SUGGEST_IMPL_H +#define LATINIME_SUGGEST_IMPL_H + +#include "defines.h" +#include "suggest/core/suggest_interface.h" +#include "suggest/core/policy/suggest_policy.h" + +namespace latinime { + +// Naming convention +// - Distance: "Weighted" edit distance -- used both for spatial and language. +// - Compound Distance: Spatial Distance + Language Distance -- used for pruning and scoring +// - Cost: delta/diff for Distance -- used both for spatial and language +// - Length: "Non-weighted" -- used only for spatial +// - Probability: "Non-weighted" -- used only for language +// - Score: Final calibrated score based on the compound distance, which is sent to java as the +// priority of a suggested word + +class DicNode; +class DicTraverseSession; +class ProximityInfo; +class Scoring; +class SuggestionResults; +class Traversal; +class Weighting; + +class Suggest : public SuggestInterface { + public: + AK_FORCE_INLINE Suggest(const SuggestPolicy *const suggestPolicy) + : TRAVERSAL(suggestPolicy ? suggestPolicy->getTraversal() : nullptr), + SCORING(suggestPolicy ? suggestPolicy->getScoring() : nullptr), + WEIGHTING(suggestPolicy ? 
suggestPolicy->getWeighting() : nullptr) {} + AK_FORCE_INLINE virtual ~Suggest() {} + void getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, + int *times, int *pointerIds, int *inputCodePoints, int inputSize, + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest); + void createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode, + const bool spaceSubstitution) const; + void initializeSearch(DicTraverseSession *traverseSession) const; + void expandCurrentDicNodes(DicTraverseSession *traverseSession) const; + void processTerminalDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processExpandedDicNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void weightChildNode(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsOmission(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsDigraph(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsTransposition(DicTraverseSession *traverseSession, + DicNode *dicNode) const; + void processDicNodeAsInsertion(DicTraverseSession *traverseSession, DicNode *dicNode) const; + void processDicNodeAsAdditionalProximityChar(DicTraverseSession *traverseSession, + DicNode *dicNode, DicNode *childDicNode) const; + void processDicNodeAsSubstitution(DicTraverseSession *traverseSession, DicNode *dicNode, + DicNode *childDicNode) const; + void processDicNodeAsMatch(DicTraverseSession *traverseSession, + DicNode *childDicNode) const; + + static const int MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE; + + const Traversal *const TRAVERSAL; + const Scoring *const SCORING; + const Weighting *const WEIGHTING; +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_IMPL_H diff --git a/app/src/main/jni/src/suggest/core/suggest_interface.h 
b/app/src/main/jni/src/suggest/core/suggest_interface.h new file mode 100644 index 000000000..a05aa9c80 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/suggest_interface.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_INTERFACE_H +#define LATINIME_SUGGEST_INTERFACE_H + +#include "defines.h" + +namespace latinime { + +class ProximityInfo; +class SuggestionResults; + +class SuggestInterface { + public: + virtual void getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, + int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize, + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const suggestionResults) const = 0; + SuggestInterface() {} + virtual ~SuggestInterface() {} + private: + DISALLOW_COPY_AND_ASSIGN(SuggestInterface); +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_INTERFACE_H diff --git a/app/src/main/jni/src/suggest/core/suggest_options.h b/app/src/main/jni/src/suggest/core/suggest_options.h new file mode 100644 index 000000000..befb216d3 --- /dev/null +++ b/app/src/main/jni/src/suggest/core/suggest_options.h @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SUGGEST_OPTIONS_H +#define LATINIME_SUGGEST_OPTIONS_H + +#include "defines.h" + +namespace latinime { + +class SuggestOptions{ + public: + SuggestOptions(const int *const options, const int length) + : mOptions(options), mLength(length) {} + + AK_FORCE_INLINE bool isGesture() const { + return getBoolOption(IS_GESTURE); + } + + AK_FORCE_INLINE bool useFullEditDistance() const { + return getBoolOption(USE_FULL_EDIT_DISTANCE); + } + + AK_FORCE_INLINE bool blockOffensiveWords() const { + return getBoolOption(BLOCK_OFFENSIVE_WORDS); + } + + AK_FORCE_INLINE bool enableSpaceAwareGesture() const { + return getBoolOption(SPACE_AWARE_GESTURE_ENABLED); + } + + AK_FORCE_INLINE float weightForLocale() const { + // The weight is in thousands and we want the real value, so we divide by 1000. + // NativeSuggestOptions#setWeightForLocale does the opposite processing in Java. + return static_cast(getIntOption(WEIGHT_FOR_LOCALE_IN_THOUSANDS)) / 1000.0f; + } + + AK_FORCE_INLINE bool getAdditionalFeaturesBoolOption(const int key) const { + return getBoolOption(key + ADDITIONAL_FEATURES_OPTIONS); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestOptions); + + // Need to update be.scri.latin.NativeSuggestOptions when you add, remove or + // reorder options. 
+ static const int IS_GESTURE = 0; + static const int USE_FULL_EDIT_DISTANCE = 1; + static const int BLOCK_OFFENSIVE_WORDS = 2; + static const int SPACE_AWARE_GESTURE_ENABLED = 3; + static const int WEIGHT_FOR_LOCALE_IN_THOUSANDS = 4; + // Additional features options are stored after the other options and used as setting values of + // experimental features. + static const int ADDITIONAL_FEATURES_OPTIONS = 5; + + const int *const mOptions; + const int mLength; + + AK_FORCE_INLINE bool isValidKey(const int key) const { + return 0 <= key && key < mLength; + } + + AK_FORCE_INLINE bool getBoolOption(const int key) const { + if (isValidKey(key)) { + return mOptions[key] != 0; + } + return false; + } + + AK_FORCE_INLINE int getIntOption(const int key) const { + if (isValidKey(key)) { + return mOptions[key]; + } + return 0; + } +}; +} // namespace latinime +#endif // LATINIME_SUGGEST_OPTIONS_H diff --git a/app/src/main/jni/src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp b/app/src/main/jni/src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp new file mode 100644 index 000000000..6d3173937 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gesture_suggest_policy_factory.h" + +namespace latinime { + const SuggestPolicy *(*GestureSuggestPolicyFactory::sGestureSuggestFactoryMethod)() = 0; +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.h b/app/src/main/jni/src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.h new file mode 100644 index 000000000..509b01fc0 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/gesture/gesture_suggest_policy_factory.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_GESTURE_SUGGEST_POLICY_FACTORY_H +#define LATINIME_GESTURE_SUGGEST_POLICY_FACTORY_H + +#include "defines.h" + +namespace latinime { + +class SuggestPolicy; + +class GestureSuggestPolicyFactory { + public: + static void setGestureSuggestPolicyFactoryMethod(const SuggestPolicy *(*factoryMethod)()) { + sGestureSuggestFactoryMethod = factoryMethod; + } + + static const SuggestPolicy *getGestureSuggestPolicy() { + if (!sGestureSuggestFactoryMethod) { + return 0; + } + return sGestureSuggestFactoryMethod(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(GestureSuggestPolicyFactory); + static const SuggestPolicy *(*sGestureSuggestFactoryMethod)(); +}; +} // namespace latinime +#endif // LATINIME_GESTURE_SUGGEST_POLICY_FACTORY_H diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/app/src/main/jni/src/suggest/policyimpl/typing/scoring_params.cpp new file mode 100644 index 000000000..856808a74 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "suggest/policyimpl/typing/scoring_params.h" + +namespace latinime { +// TODO: RENAME all +const float ScoringParams::MAX_SPATIAL_DISTANCE = 1.0f; +const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY = 40; +const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; +const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; + +const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f; +const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f; +const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f; +const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f; +const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f; + +// TODO: Unlimit max cache dic node size +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE = 50; +const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; + +const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; +const float ScoringParams::PROXIMITY_COST = 0.0694f; +const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.072f; +const float ScoringParams::FIRST_PROXIMITY_COST = 0.07788f; +const float ScoringParams::INTENTIONAL_OMISSION_COST = 0.1f; +const float ScoringParams::OMISSION_COST = 0.467f; +const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.345f; +const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.5256f; +const float ScoringParams::INSERTION_COST = 0.7248f; +const float ScoringParams::TERMINAL_INSERTION_COST = 0.8128f; +const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f; +const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f; +const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f; +const float ScoringParams::TRANSPOSITION_COST = 0.5608f; +const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.33f; +const float ScoringParams::SPACE_OMISSION_COST = 0.1f; +const 
float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.37972f; +const float ScoringParams::SUBSTITUTION_COST = 0.3806f; +const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f; +const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f; +const float ScoringParams::COST_FIRST_COMPLETION = 0.4836f; +const float ScoringParams::COST_COMPLETION = 0.00624f; +const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.0683f; +const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.3482f; +const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; +const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; +const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.095f; +const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION = 0.99f; +const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION = 0.99f; +const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE = 0.99f; +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/scoring_params.h b/app/src/main/jni/src/suggest/policyimpl/typing/scoring_params.h new file mode 100644 index 000000000..6f327a370 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_SCORING_PARAMS_H +#define LATINIME_SCORING_PARAMS_H + +#include "defines.h" + +namespace latinime { + +class ScoringParams { + public: + // Fixed model parameters + static const float MAX_SPATIAL_DISTANCE; + static const int THRESHOLD_NEXT_WORD_PROBABILITY; + static const int THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; + static const float AUTOCORRECT_OUTPUT_THRESHOLD; + static const int MAX_CACHE_DIC_NODE_SIZE; + static const int MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; + static const int MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE; + static const int THRESHOLD_SHORT_WORD_LENGTH; + + static const float EXACT_MATCH_PROMOTION; + static const float PERFECT_MATCH_PROMOTION; + static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH; + + // Numerically optimized parameters (currently for tap typing only). + // TODO: add ability to modify these constants programmatically. + // TODO: explore optimization of gesture parameters. 
+ static const float DISTANCE_WEIGHT_LENGTH; + static const float PROXIMITY_COST; + static const float FIRST_CHAR_PROXIMITY_COST; + static const float FIRST_PROXIMITY_COST; + static const float INTENTIONAL_OMISSION_COST; + static const float OMISSION_COST; + static const float OMISSION_COST_SAME_CHAR; + static const float OMISSION_COST_FIRST_CHAR; + static const float INSERTION_COST; + static const float TERMINAL_INSERTION_COST; + static const float INSERTION_COST_SAME_CHAR; + static const float INSERTION_COST_PROXIMITY_CHAR; + static const float INSERTION_COST_FIRST_CHAR; + static const float TRANSPOSITION_COST; + static const float SPACE_SUBSTITUTION_COST; + static const float SPACE_OMISSION_COST; + static const float ADDITIONAL_PROXIMITY_COST; + static const float SUBSTITUTION_COST; + static const float COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE; + static const float DISTANCE_WEIGHT_LANGUAGE; + static const float COST_FIRST_COMPLETION; + static const float COST_COMPLETION; + static const float HAS_PROXIMITY_TERMINAL_COST; + static const float HAS_EDIT_CORRECTION_TERMINAL_COST; + static const float HAS_MULTI_WORD_TERMINAL_COST; + static const float TYPING_BASE_OUTPUT_SCORE; + static const float TYPING_MAX_OUTPUT_SCORE_PER_INPUT; + static const float NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT; + static const float LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION; + static const float LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION; + static const float LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ScoringParams); +}; +} // namespace latinime +#endif // LATINIME_SCORING_PARAMS_H diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_scoring.cpp b/app/src/main/jni/src/suggest/policyimpl/typing/typing_scoring.cpp new file mode 100644 index 000000000..d8c6175e2 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_scoring.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2013 The Android Open 
Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_scoring.h" + +namespace latinime { +const TypingScoring TypingScoring::sInstance; +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_scoring.h b/app/src/main/jni/src/suggest/policyimpl/typing/typing_scoring.h new file mode 100644 index 000000000..6acd767ea --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_TYPING_SCORING_H +#define LATINIME_TYPING_SCORING_H + +#include "defines.h" +#include "suggest/core/dictionary/error_type_utils.h" +#include "suggest/core/policy/scoring.h" +#include "suggest/core/session/dic_traverse_session.h" +#include "suggest/policyimpl/typing/scoring_params.h" + +namespace latinime { + +class DicNode; +class DicTraverseSession; + +class TypingScoring : public Scoring { + public: + static const TypingScoring *getInstance() { return &sInstance; } + + AK_FORCE_INLINE void getMostProbableString(const DicTraverseSession *const traverseSession, + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const {} + + AK_FORCE_INLINE float getAdjustedWeightOfLangModelVsSpatialModel( + DicTraverseSession *const traverseSession, DicNode *const terminals, + const int size) const { + return 1.0f; + } + + AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize, + const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, + const bool boostExactMatches, const bool hasProbabilityZero) const { + const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + + static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; + float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance; + if (forceCommit) { + score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD; + } + if (hasProbabilityZero) { + // Previously, when both legitimate 0-frequency words (such as distracters) and + // offensive words were encoded in the same way, distracters would never show up + // when the user blocked offensive words (the default setting, as well as the + // setting for regression tests).
+ // + // When b/11031090 was fixed and a separate encoding was used for offensive words, + // 0-frequency words would no longer be blocked when they were an "exact match" + // (where case mismatches and accent mismatches would be considered an "exact + // match"). The exact match boosting functionality meant that, for example, when + // the user typed "mt" they would be suggested the word "Mt", although they most + // probably meant to type "my". + // + // For this reason, we introduced this change, which does the following: + // * Defines the "perfect match" as a really exact match, with no room for case or + // accent mismatches + // * When the target word has probability zero (as "Mt" does, because it is a + // distracter), ONLY boost its score if it is a perfect match. + // + // By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and + // they will get "my". However, if the user makes an explicit effort to type "Mt", + // we do boost the word "Mt" so that the user's input is not autocorrected to "My".
+ if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) { + score += ScoringParams::PERFECT_MATCH_PROMOTION; + } + } else { + if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { + score += ScoringParams::EXACT_MATCH_PROMOTION; + if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { + score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { + score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { + score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } + } + } + return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); + } + + AK_FORCE_INLINE float getDoubleLetterDemotionDistanceCost( + const DicNode *const terminalDicNode) const { + return 0.0f; + } + + AK_FORCE_INLINE bool autoCorrectsToMultiWordSuggestionIfTop() const { + return true; + } + + AK_FORCE_INLINE bool sameAsTyped(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + return traverseSession->getProximityInfoState(0)->sameAsTyped( + dicNode->getOutputWordBuf(), dicNode->getNodeCodePointCount()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingScoring); + static const TypingScoring sInstance; + + TypingScoring() {} + ~TypingScoring() {} +}; +} // namespace latinime +#endif // LATINIME_TYPING_SCORING_H diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy.cpp b/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy.cpp new file mode 100644 index 000000000..0c2763967 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_suggest_policy.h" + +namespace latinime { +const TypingSuggestPolicy TypingSuggestPolicy::sInstance; +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy.h b/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy.h new file mode 100644 index 000000000..35f48097c --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_TYPING_SUGGEST_POLICY_H +#define LATINIME_TYPING_SUGGEST_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/suggest_policy.h" +#include "suggest/policyimpl/typing/typing_scoring.h" +#include "suggest/policyimpl/typing/typing_traversal.h" +#include "suggest/policyimpl/typing/typing_weighting.h" + +namespace latinime { + +class Scoring; +class Traversal; +class Weighting; + +class TypingSuggestPolicy : public SuggestPolicy { + public: + static const TypingSuggestPolicy *getInstance() { return &sInstance; } + + TypingSuggestPolicy() {} + virtual ~TypingSuggestPolicy() {} + AK_FORCE_INLINE const Traversal *getTraversal() const { + return TypingTraversal::getInstance(); + } + + AK_FORCE_INLINE const Scoring *getScoring() const { + return TypingScoring::getInstance(); + } + + AK_FORCE_INLINE const Weighting *getWeighting() const { + return TypingWeighting::getInstance(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingSuggestPolicy); + static const TypingSuggestPolicy sInstance; +}; +} // namespace latinime +#endif // LATINIME_TYPING_SUGGEST_POLICY_H diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy_factory.h b/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy_factory.h new file mode 100644 index 000000000..a67b45b1b --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_suggest_policy_factory.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Factory with a single static accessor that exposes the typing policy through the
// SuggestPolicy base type.
class TypingSuggestPolicyFactory {
 public:
    // Returns TypingSuggestPolicy::getInstance() as a const SuggestPolicy pointer.
    static const SuggestPolicy *getTypingSuggestPolicy() {
        return TypingSuggestPolicy::getInstance();
    }

 private:
    DISALLOW_COPY_AND_ASSIGN(TypingSuggestPolicyFactory);
};
// Switches read at the top of TypingTraversal::isOmission(), isSpaceSubstitutionTerminal()
// and isSpaceOmissionTerminal() respectively; each of those methods returns false
// immediately when its switch is false. All three are enabled here.
const bool TypingTraversal::CORRECT_OMISSION = true;
const bool TypingTraversal::CORRECT_NEW_WORD_SPACE_SUBSTITUTION = true;
const bool TypingTraversal::CORRECT_NEW_WORD_SPACE_OMISSION = true;
// Storage for the instance returned by TypingTraversal::getInstance().
const TypingTraversal TypingTraversal::sInstance;
+ */ + +#ifndef LATINIME_TYPING_TRAVERSAL_H +#define LATINIME_TYPING_TRAVERSAL_H + +#include + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/layout/proximity_info_state.h" +#include "suggest/core/layout/proximity_info_utils.h" +#include "suggest/core/policy/traversal.h" +#include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/suggest_options.h" +#include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" + +namespace latinime { +class TypingTraversal : public Traversal { + public: + static const TypingTraversal *getInstance() { return &sInstance; } + + AK_FORCE_INLINE int getMaxPointerCount() const { + return MAX_POINTER_COUNT; + } + + AK_FORCE_INLINE bool allowsErrorCorrections(const DicNode *const dicNode) const { + return dicNode->getNormalizedSpatialDistance() + < ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT; + } + + AK_FORCE_INLINE bool isOmission(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const DicNode *const childDicNode, + const bool allowsErrorCorrections) const { + if (!CORRECT_OMISSION) { + return false; + } + // Note: Always consider intentional omissions (like apostrophes) since they are common. + const bool canConsiderOmission = + allowsErrorCorrections || childDicNode->canBeIntentionalOmission(); + if (!canConsiderOmission) { + return false; + } + const int inputSize = traverseSession->getInputSize(); + // TODO: Don't refer to isCompletion? 
+ if (dicNode->isCompletion(inputSize)) { + return false; + } + if (dicNode->canBeIntentionalOmission()) { + return true; + } + const int point0Index = dicNode->getInputIndex(0); + const int currentBaseLowerCodePoint = + CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); + const int typedBaseLowerCodePoint = + CharUtils::toBaseLowerCase(traverseSession->getProximityInfoState(0) + ->getPrimaryCodePointAt(point0Index)); + return (currentBaseLowerCodePoint != typedBaseLowerCodePoint); + } + + AK_FORCE_INLINE bool isSpaceSubstitutionTerminal( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + if (!CORRECT_NEW_WORD_SPACE_SUBSTITUTION) { + return false; + } + if (traverseSession->getSuggestOptions()->weightForLocale() + < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION) { + // Space substitution is heavy, so we skip doing it if the weight for this language + // is low because we anticipate the suggestions out of this dictionary are not for + // the language the user intends to type in. + return false; + } + if (!canDoLookAheadCorrection(traverseSession, dicNode)) { + return false; + } + const int point0Index = dicNode->getInputIndex(0); + return dicNode->isTerminalDicNode() + && traverseSession->getProximityInfoState(0)-> + hasSpaceProximity(point0Index); + } + + AK_FORCE_INLINE bool isSpaceOmissionTerminal( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + if (!CORRECT_NEW_WORD_SPACE_OMISSION) { + return false; + } + if (traverseSession->getSuggestOptions()->weightForLocale() + < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION) { + // Space omission is heavy, so we skip doing it if the weight for this language + // is low because we anticipate the suggestions out of this dictionary are not for + // the language the user intends to type in. + return false; + } + const int inputSize = traverseSession->getInputSize(); + // TODO: Don't refer to isCompletion? 
+ if (dicNode->isCompletion(inputSize)) { + return false; + } + if (!dicNode->isTerminalDicNode()) { + return false; + } + const int16_t pointIndex = dicNode->getInputIndex(0); + return pointIndex <= inputSize && !dicNode->isTotalInputSizeExceedingLimit() + && !dicNode->shouldBeFilteredBySafetyNetForBigram(); + } + + AK_FORCE_INLINE bool shouldDepthLevelCache( + const DicTraverseSession *const traverseSession) const { + const int inputSize = traverseSession->getInputSize(); + return traverseSession->isCacheBorderForTyping(inputSize); + } + + AK_FORCE_INLINE bool shouldNodeLevelCache( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + return false; + } + + AK_FORCE_INLINE bool canDoLookAheadCorrection( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { + const int inputSize = traverseSession->getInputSize(); + return dicNode->canDoLookAheadCorrection(inputSize); + } + + AK_FORCE_INLINE ProximityType getProximityType( + const DicTraverseSession *const traverseSession, const DicNode *const dicNode, + const DicNode *const childDicNode) const { + return traverseSession->getProximityInfoState(0)->getProximityType( + dicNode->getInputIndex(0), childDicNode->getNodeCodePoint(), + true /* checkProximityChars */); + } + + AK_FORCE_INLINE bool needsToTraverseAllUserInput() const { + return true; + } + + AK_FORCE_INLINE float getMaxSpatialDistance() const { + return ScoringParams::MAX_SPATIAL_DISTANCE; + } + + AK_FORCE_INLINE int getDefaultExpandDicNodeSize() const { + return DicNodeVector::DEFAULT_NODES_SIZE_FOR_OPTIMIZATION; + } + + AK_FORCE_INLINE int getMaxCacheSize(const int inputSize, const float weightForLocale) const { + if (inputSize <= 1) { + return ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; + } + if (weightForLocale < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE) { + return ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE; + } + return 
ScoringParams::MAX_CACHE_DIC_NODE_SIZE; + } + + AK_FORCE_INLINE int getTerminalCacheSize() const { + return MAX_RESULTS; + } + + AK_FORCE_INLINE bool isPossibleOmissionChildNode( + const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, + const DicNode *const dicNode) const { + const ProximityType proximityType = + getProximityType(traverseSession, parentDicNode, dicNode); + if (!ProximityInfoUtils::isMatchOrProximityChar(proximityType)) { + return false; + } + return true; + } + + AK_FORCE_INLINE bool isGoodToTraverseNextWord(const DicNode *const dicNode, + const int probability) const { + if (probability < ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY) { + return false; + } + const bool shortCappedWord = dicNode->getNodeCodePointCount() + < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && dicNode->isFirstCharUppercase(); + return !shortCappedWord + || probability >= ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; + } + + private: + DISALLOW_COPY_AND_ASSIGN(TypingTraversal); + static const bool CORRECT_OMISSION; + static const bool CORRECT_NEW_WORD_SPACE_SUBSTITUTION; + static const bool CORRECT_NEW_WORD_SPACE_OMISSION; + static const TypingTraversal sInstance; + + TypingTraversal() {} + ~TypingTraversal() {} +}; +} // namespace latinime +#endif // LATINIME_TYPING_TRAVERSAL_H diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/app/src/main/jni/src/suggest/policyimpl/typing/typing_weighting.cpp new file mode 100644 index 000000000..a0e54115d --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_weighting.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/typing/typing_weighting.h" + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/layout/proximity_info.h" +#include "suggest/policyimpl/typing/scoring_params.h" + +namespace latinime { + +const TypingWeighting TypingWeighting::sInstance; + +ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType correctionType, + const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, + const DicNode *const dicNode) const { + switch (correctionType) { + case CT_MATCH: + if (isProximityDicNode(traverseSession, dicNode)) { + return ErrorTypeUtils::PROXIMITY_CORRECTION; + } else if (dicNode->isInDigraph()) { + return ErrorTypeUtils::MATCH_WITH_DIGRAPH; + } else { + // Compare the node code point with original primary code point on the keyboard. + const ProximityInfoState *const pInfoState = + traverseSession->getProximityInfoState(0); + const int primaryCodePoint = pInfoState->getPrimaryCodePointAt( + dicNode->getInputIndex(0)); + const int nodeCodePoint = dicNode->getNodeCodePoint(); + const int keyIndex = traverseSession->getProximityInfo()->getKeyIndexOf( + primaryCodePoint); + // TODO: Check whether the input code point is on the keyboard. + if (primaryCodePoint == nodeCodePoint) { + // Node code point is same as original code point on the keyboard. + return ErrorTypeUtils::NOT_AN_ERROR; + } else if (CharUtils::toLowerCase(primaryCodePoint) == + CharUtils::toLowerCase(nodeCodePoint)) { + // Only cases of the code points are different. 
+ return ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } else if (primaryCodePoint == CharUtils::toBaseCodePoint(nodeCodePoint)) { + // Node code point is a variant of original code point. + return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT; + } else if (CharUtils::toBaseCodePoint(primaryCodePoint) + == CharUtils::toBaseCodePoint(nodeCodePoint)) { + // Base code points are the same but the code point is intentionally input. + if (keyIndex == NOT_AN_INDEX) { + return ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT; + } + return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT; + } else if (CharUtils::toLowerCase(primaryCodePoint) + == CharUtils::toBaseLowerCase(nodeCodePoint)) { + // Node code point is a variant of original code point and the cases are also + // different. + return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } else { + if (keyIndex == NOT_AN_INDEX) { + return ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } + // Base code points are the same and the cases are different. + return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } + } + break; + case CT_ADDITIONAL_PROXIMITY: + // TODO: Change to EDIT_CORRECTION. + return ErrorTypeUtils::PROXIMITY_CORRECTION; + case CT_OMISSION: + if (parentDicNode->canBeIntentionalOmission()) { + return ErrorTypeUtils::INTENTIONAL_OMISSION; + } else { + return ErrorTypeUtils::EDIT_CORRECTION; + } + break; + case CT_SUBSTITUTION: + // TODO: Quit settng PROXIMITY_CORRECTION. 
+ return ErrorTypeUtils::EDIT_CORRECTION | ErrorTypeUtils::PROXIMITY_CORRECTION; + case CT_INSERTION: + case CT_TERMINAL_INSERTION: + case CT_TRANSPOSITION: + return ErrorTypeUtils::EDIT_CORRECTION; + case CT_NEW_WORD_SPACE_OMISSION: + case CT_NEW_WORD_SPACE_SUBSTITUTION: + return ErrorTypeUtils::NEW_WORD; + case CT_TERMINAL: + return ErrorTypeUtils::NOT_AN_ERROR; + case CT_COMPLETION: + return ErrorTypeUtils::COMPLETION; + default: + return ErrorTypeUtils::NOT_AN_ERROR; + } +} +} // namespace latinime diff --git a/app/src/main/jni/src/suggest/policyimpl/typing/typing_weighting.h b/app/src/main/jni/src/suggest/policyimpl/typing/typing_weighting.h new file mode 100644 index 000000000..1338ac81a --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -0,0 +1,225 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_TYPING_WEIGHTING_H +#define LATINIME_TYPING_WEIGHTING_H + +#include "defines.h" +#include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/dictionary/error_type_utils.h" +#include "suggest/core/layout/touch_position_correction_utils.h" +#include "suggest/core/policy/weighting.h" +#include "suggest/core/session/dic_traverse_session.h" +#include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" + +namespace latinime { + +class DicNode; +struct DicNode_InputStateG; +class MultiBigramMap; + +class TypingWeighting : public Weighting { + public: + static const TypingWeighting *getInstance() { return &sInstance; } + + protected: + float getTerminalSpatialCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + float cost = 0.0f; + if (dicNode->hasMultipleWords()) { + cost += ScoringParams::HAS_MULTI_WORD_TERMINAL_COST; + } + if (dicNode->getProximityCorrectionCount() > 0) { + cost += ScoringParams::HAS_PROXIMITY_TERMINAL_COST; + } + if (dicNode->getEditCorrectionCount() > 0) { + cost += ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST; + } + return cost; + } + + float getOmissionCost(const DicNode *const parentDicNode, const DicNode *const dicNode) const { + const bool isZeroCostOmission = parentDicNode->isZeroCostOmission(); + const bool isIntentionalOmission = parentDicNode->canBeIntentionalOmission(); + const bool sameCodePoint = dicNode->isSameNodeCodePoint(parentDicNode); + // If the traversal omitted the first letter then the dicNode should now be on the second. + const bool isFirstLetterOmission = dicNode->getNodeCodePointCount() == 2; + float cost = 0.0f; + if (isZeroCostOmission) { + cost = 0.0f; + } else if (isIntentionalOmission) { + cost = ScoringParams::INTENTIONAL_OMISSION_COST; + } else if (isFirstLetterOmission) { + cost = ScoringParams::OMISSION_COST_FIRST_CHAR; + } else { + cost = sameCodePoint ? 
ScoringParams::OMISSION_COST_SAME_CHAR + : ScoringParams::OMISSION_COST; + } + return cost; + } + + float getMatchedCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const { + const int pointIndex = dicNode->getInputIndex(0); + const float normalizedSquaredLength = traverseSession->getProximityInfoState(0) + ->getPointToKeyLength(pointIndex, + CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint())); + const float normalizedDistance = TouchPositionCorrectionUtils::getSweetSpotFactor( + traverseSession->isTouchPositionCorrectionEnabled(), normalizedSquaredLength); + const float weightedDistance = ScoringParams::DISTANCE_WEIGHT_LENGTH * normalizedDistance; + + const bool isFirstChar = pointIndex == 0; + const bool isProximity = isProximityDicNode(traverseSession, dicNode); + float cost = isProximity ? (isFirstChar ? ScoringParams::FIRST_CHAR_PROXIMITY_COST + : ScoringParams::PROXIMITY_COST) : 0.0f; + if (isProximity && dicNode->getProximityCorrectionCount() == 0) { + cost += ScoringParams::FIRST_PROXIMITY_COST; + } + if (dicNode->getNodeCodePointCount() == 2) { + // At the second character of the current word, we check if the first char is uppercase + // and the word is a second or later word of a multiple word suggestion. We demote it + // if so. 
+ const bool isSecondOrLaterWordFirstCharUppercase = + dicNode->hasMultipleWords() && dicNode->isFirstCharUppercase(); + if (isSecondOrLaterWordFirstCharUppercase) { + cost += ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE; + } + } + return weightedDistance + cost; + } + + bool isProximityDicNode(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + const int pointIndex = dicNode->getInputIndex(0); + const int primaryCodePoint = CharUtils::toBaseLowerCase( + traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex)); + const int dicNodeChar = CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint()); + return primaryCodePoint != dicNodeChar; + } + + float getTranspositionCost(const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode) const { + const int16_t parentPointIndex = parentDicNode->getInputIndex(0); + const int prevCodePoint = parentDicNode->getNodeCodePoint(); + const float distance1 = traverseSession->getProximityInfoState(0)->getPointToKeyLength( + parentPointIndex + 1, CharUtils::toBaseLowerCase(prevCodePoint)); + const int codePoint = dicNode->getNodeCodePoint(); + const float distance2 = traverseSession->getProximityInfoState(0)->getPointToKeyLength( + parentPointIndex, CharUtils::toBaseLowerCase(codePoint)); + const float distance = distance1 + distance2; + const float weightedLengthDistance = + distance * ScoringParams::DISTANCE_WEIGHT_LENGTH; + return ScoringParams::TRANSPOSITION_COST + weightedLengthDistance; + } + + float getInsertionCost(const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode) const { + const int16_t insertedPointIndex = parentDicNode->getInputIndex(0); + const int prevCodePoint = traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt( + insertedPointIndex); + const int currentCodePoint = dicNode->getNodeCodePoint(); + const bool 
sameCodePoint = prevCodePoint == currentCodePoint; + const bool existsAdjacentProximityChars = traverseSession->getProximityInfoState(0) + ->existsAdjacentProximityChars(insertedPointIndex); + const float dist = traverseSession->getProximityInfoState(0)->getPointToKeyLength( + insertedPointIndex + 1, CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint())); + const float weightedDistance = dist * ScoringParams::DISTANCE_WEIGHT_LENGTH; + const bool singleChar = dicNode->getNodeCodePointCount() == 1; + float cost = (singleChar ? ScoringParams::INSERTION_COST_FIRST_CHAR : 0.0f); + if (sameCodePoint) { + cost += ScoringParams::INSERTION_COST_SAME_CHAR; + } else if (existsAdjacentProximityChars) { + cost += ScoringParams::INSERTION_COST_PROXIMITY_CHAR; + } else { + cost += ScoringParams::INSERTION_COST; + } + return cost + weightedDistance; + } + + float getSpaceOmissionCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const { + const float cost = ScoringParams::SPACE_OMISSION_COST; + return cost * traverseSession->getMultiWordCostMultiplier(); + } + + float getNewWordBigramLanguageCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, + MultiBigramMap *const multiBigramMap) const { + return DicNodeUtils::getBigramNodeImprobability( + traverseSession->getDictionaryStructurePolicy(), + dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; + } + + float getCompletionCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + // The auto completion starts when the input index is same as the input size + const bool firstCompletion = dicNode->getInputIndex(0) + == traverseSession->getInputSize(); + // TODO: Change the cost for the first completion for the gesture? + const float cost = firstCompletion ? 
ScoringParams::COST_FIRST_COMPLETION + : ScoringParams::COST_COMPLETION; + return cost; + } + + float getTerminalLanguageCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, const float dicNodeLanguageImprobability) const { + return dicNodeLanguageImprobability * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; + } + + float getTerminalInsertionCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + const int inputIndex = dicNode->getInputIndex(0); + const int inputSize = traverseSession->getInputSize(); + ASSERT(inputIndex < inputSize); + // TODO: Implement more efficient logic + return ScoringParams::TERMINAL_INSERTION_COST * (inputSize - inputIndex); + } + + AK_FORCE_INLINE bool needsToNormalizeCompoundDistance() const { + return false; + } + + AK_FORCE_INLINE float getAdditionalProximityCost() const { + return ScoringParams::ADDITIONAL_PROXIMITY_COST; + } + + AK_FORCE_INLINE float getSubstitutionCost() const { + return ScoringParams::SUBSTITUTION_COST; + } + + AK_FORCE_INLINE float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + const int inputIndex = dicNode->getInputIndex(0); + const float distanceToSpaceKey = traverseSession->getProximityInfoState(0) + ->getPointToKeyLength(inputIndex, KEYCODE_SPACE); + const float cost = ScoringParams::SPACE_SUBSTITUTION_COST * distanceToSpaceKey; + return cost * traverseSession->getMultiWordCostMultiplier(); + } + + ErrorTypeUtils::ErrorType getErrorType(const CorrectionType correctionType, + const DicTraverseSession *const traverseSession, + const DicNode *const parentDicNode, const DicNode *const dicNode) const; + + private: + DISALLOW_COPY_AND_ASSIGN(TypingWeighting); + static const TypingWeighting sInstance; + + TypingWeighting() {} + ~TypingWeighting() {} +}; +} // namespace latinime +#endif // LATINIME_TYPING_WEIGHTING_H diff --git 
a/app/src/main/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h b/app/src/main/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h new file mode 100644 index 000000000..81614bc9c --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H +#define LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H + +#include "suggest/policyimpl/utils/edit_distance_policy.h" +#include "utils/char_utils.h" + +namespace latinime { + +class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy { + public: + DamerauLevenshteinEditDistancePolicy(const int *const string0, const int length0, + const int *const string1, const int length1) + : mString0(string0), mString0Length(length0), mString1(string1), + mString1Length(length1) {} + ~DamerauLevenshteinEditDistancePolicy() {} + + AK_FORCE_INLINE float getSubstitutionCost(const int index0, const int index1) const { + const int c0 = CharUtils::toBaseLowerCase(mString0[index0]); + const int c1 = CharUtils::toBaseLowerCase(mString1[index1]); + return (c0 == c1) ? 
0.0f : 1.0f; + } + + AK_FORCE_INLINE float getDeletionCost(const int index0, const int index1) const { + return 1.0f; + } + + AK_FORCE_INLINE float getInsertionCost(const int index0, const int index1) const { + return 1.0f; + } + + AK_FORCE_INLINE bool allowTransposition(const int index0, const int index1) const { + const int c0 = CharUtils::toBaseLowerCase(mString0[index0]); + const int c1 = CharUtils::toBaseLowerCase(mString1[index1]); + if (index0 > 0 && index1 > 0 && c0 == CharUtils::toBaseLowerCase(mString1[index1 - 1]) + && c1 == CharUtils::toBaseLowerCase(mString0[index0 - 1])) { + return true; + } + return false; + } + + AK_FORCE_INLINE float getTranspositionCost(const int index0, const int index1) const { + return getSubstitutionCost(index0, index1); + } + + AK_FORCE_INLINE int getString0Length() const { + return mString0Length; + } + + AK_FORCE_INLINE int getString1Length() const { + return mString1Length; + } + + private: + DISALLOW_COPY_AND_ASSIGN (DamerauLevenshteinEditDistancePolicy); + + const int *const mString0; + const int mString0Length; + const int *const mString1; + const int mString1Length; +}; +} // namespace latinime + +#endif // LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H diff --git a/app/src/main/jni/src/suggest/policyimpl/utils/edit_distance.h b/app/src/main/jni/src/suggest/policyimpl/utils/edit_distance.h new file mode 100644 index 000000000..4cfd0b3f3 --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/utils/edit_distance.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_EDIT_DISTANCE_H +#define LATINIME_EDIT_DISTANCE_H + +#include + +#include "defines.h" +#include "suggest/policyimpl/utils/edit_distance_policy.h" + +namespace latinime { + +class EditDistance { + public: + // CAVEAT: There may be performance penalty if you need the edit distance as an integer value. + AK_FORCE_INLINE static float getEditDistance(const EditDistancePolicy *const policy) { + const int beforeLength = policy->getString0Length(); + const int afterLength = policy->getString1Length(); + float dp[(beforeLength + 1) * (afterLength + 1)]; + for (int i = 0; i <= beforeLength; ++i) { + dp[(afterLength + 1) * i] = i * policy->getInsertionCost(i - 1, -1); + } + for (int i = 0; i <= afterLength; ++i) { + dp[i] = i * policy->getDeletionCost(-1, i - 1); + } + + for (int i = 0; i < beforeLength; ++i) { + for (int j = 0; j < afterLength; ++j) { + dp[(afterLength + 1) * (i + 1) + (j + 1)] = std::min( + dp[(afterLength + 1) * i + (j + 1)] + policy->getInsertionCost(i, j), + std::min( + dp[(afterLength + 1) * (i + 1) + j] + policy->getDeletionCost(i, j), + dp[(afterLength + 1) * i + j] + policy->getSubstitutionCost(i, j))); + if (policy->allowTransposition(i, j)) { + dp[(afterLength + 1) * (i + 1) + (j + 1)] = std::min( + dp[(afterLength + 1) * (i + 1) + (j + 1)], + dp[(afterLength + 1) * (i - 1) + (j - 1)] + + policy->getTranspositionCost(i, j)); + } + } + } + if (DEBUG_EDIT_DISTANCE) { + AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength); + for (int i = 0; i < beforeLength + 1; ++i) { + for (int j = 0; j < afterLength + 1; 
++j) { + AKLOGI("EDIT[%d][%d], %f", i, j, dp[(afterLength + 1) * i + j]); + } + } + } + return dp[(beforeLength + 1) * (afterLength + 1) - 1]; + } + + AK_FORCE_INLINE static void dumpEditDistance10ForDebug(const float *const editDistanceTable, + const int editDistanceTableWidth, const int outputLength) { + if (DEBUG_DICT) { + AKLOGI("EditDistanceTable"); + for (int i = 0; i <= 10; ++i) { + float c[11]; + for (int j = 0; j <= 10; ++j) { + if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { + c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; + } else { + c[j] = -1.0f; + } + } + AKLOGI("[ %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f ]", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); + (void)c; // To suppress compiler warning + } + } + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(EditDistance); +}; +} // namespace latinime + +#endif // LATINIME_EDIT_DISTANCE_H diff --git a/app/src/main/jni/src/suggest/policyimpl/utils/edit_distance_policy.h b/app/src/main/jni/src/suggest/policyimpl/utils/edit_distance_policy.h new file mode 100644 index 000000000..e3d1792cb --- /dev/null +++ b/app/src/main/jni/src/suggest/policyimpl/utils/edit_distance_policy.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// Abstract cost model consumed by EditDistance::getEditDistance(). index0 refers to a
// position in string 0 and index1 to a position in string 1. Note that
// EditDistance::getEditDistance() passes -1 for one of the indices when it fills the first
// row and column of its table, so implementations must tolerate that for the insertion and
// deletion costs.
class EditDistancePolicy {
 public:
    // Cost of replacing the element at index0 with the element at index1.
    virtual float getSubstitutionCost(const int index0, const int index1) const = 0;
    // Cost of a deletion step at the given positions.
    virtual float getDeletionCost(const int index0, const int index1) const = 0;
    // Cost of an insertion step at the given positions.
    virtual float getInsertionCost(const int index0, const int index1) const = 0;
    // Whether a transposition ending at (index0, index1) may be considered.
    virtual bool allowTransposition(const int index0, const int index1) const = 0;
    // Cost of a transposition ending at (index0, index1); only queried by
    // EditDistance::getEditDistance() after allowTransposition() returned true.
    virtual float getTranspositionCost(const int index0, const int index1) const = 0;
    // Lengths of the two compared strings.
    virtual int getString0Length() const = 0;
    virtual int getString1Length() const = 0;

 protected:
    // Only subclasses may construct or destroy a policy through this base type.
    EditDistancePolicy() {}
    virtual ~EditDistancePolicy() {}

 private:
    DISALLOW_COPY_AND_ASSIGN(EditDistancePolicy);
};
+ */ + +#include "utils/autocorrection_threshold_utils.h" + +#include +#include + +#include "defines.h" +#include "suggest/policyimpl/utils/edit_distance.h" +#include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h" + +namespace latinime { + +const int AutocorrectionThresholdUtils::MAX_INITIAL_SCORE = 255; +const int AutocorrectionThresholdUtils::TYPED_LETTER_MULTIPLIER = 2; +const int AutocorrectionThresholdUtils::FULL_WORD_MULTIPLIER = 2; + +/* static */ int AutocorrectionThresholdUtils::editDistance(const int *before, + const int beforeLength, const int *after, const int afterLength) { + const DamerauLevenshteinEditDistancePolicy daemaruLevenshtein( + before, beforeLength, after, afterLength); + return static_cast(EditDistance::getEditDistance(&daemaruLevenshtein)); +} + +// In dictionary.cpp, getSuggestion() method, +// When USE_SUGGEST_INTERFACE_FOR_TYPING is true: +// +// // TODO: Revise the following logic thoroughly by referring to the logic +// // marked as "Otherwise" below. +// SUGGEST_INTERFACE_OUTPUT_SCALE was multiplied to the original suggestion scores to convert +// them to integers. +// score = (int)((original score) * SUGGEST_INTERFACE_OUTPUT_SCALE) +// Undo the scaling here to recover the original score. +// normalizedScore = ((float)score) / SUGGEST_INTERFACE_OUTPUT_SCALE +// +// Otherwise: suggestion scores are computed using the below formula. +// original score +// := powf(mTypedLetterMultiplier (this is defined 2), +// (the number of matched characters between typed word and suggested word)) +// * (individual word's score which defined in the unigram dictionary, +// and this score is defined in range [0, 255].) +// Then, the following processing is applied. 
+// - If the dictionary word is matched up to the point of the user entry +// (full match up to min(before.length(), after.length()) +// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) +// - If the word is a true full match except for differences in accents or +// capitalization, then treat it as if the score was 255. +// - If before.length() == after.length() +// => multiply by mFullWordMultiplier (this is defined 2)) +// So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2 +// For historical reasons we ignore the 1.2 modifier (because the measure for a good +// autocorrection threshold was done at a time when it didn't exist). This doesn't change +// the result. +// So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2. + +/* static */ float AutocorrectionThresholdUtils::calcNormalizedScore(const int *before, + const int beforeLength, const int *after, const int afterLength, const int score) { + if (0 == beforeLength || 0 == afterLength) { + return 0.0f; + } + const int distance = editDistance(before, beforeLength, after, afterLength); + int spaceCount = 0; + for (int i = 0; i < afterLength; ++i) { + if (after[i] == KEYCODE_SPACE) { + ++spaceCount; + } + } + + if (spaceCount == afterLength) { + return 0.0f; + } + + if (score <= 0 || distance >= afterLength) { + // normalizedScore must be 0.0f (the minimum value) if the score is less than or equal to 0, + // or if the edit distance is larger than or equal to afterLength. + return 0.0f; + } + // add a weight based on edit distance. + const float weight = 1.0f - static_cast(distance) / static_cast(afterLength); + + // TODO: Revise the following logic thoroughly by referring to... + if (true /* USE_SUGGEST_INTERFACE_FOR_TYPING */) { + return (static_cast(score) / SUGGEST_INTERFACE_OUTPUT_SCALE) * weight; + } + // ...this logic. + const float maxScore = score >= S_INT_MAX ? 
static_cast(S_INT_MAX) + : static_cast(MAX_INITIAL_SCORE) + * powf(static_cast(TYPED_LETTER_MULTIPLIER), + static_cast(std::min(beforeLength, afterLength - spaceCount))) + * static_cast(FULL_WORD_MULTIPLIER); + + return (static_cast(score) / maxScore) * weight; +} + +} // namespace latinime diff --git a/app/src/main/jni/src/utils/autocorrection_threshold_utils.h b/app/src/main/jni/src/utils/autocorrection_threshold_utils.h new file mode 100644 index 000000000..c7537a6a5 --- /dev/null +++ b/app/src/main/jni/src/utils/autocorrection_threshold_utils.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_AUTOCORRECTION_THRESHOLD_UTILS_H +#define LATINIME_AUTOCORRECTION_THRESHOLD_UTILS_H + +#include "defines.h" + +namespace latinime { + +class AutocorrectionThresholdUtils { + public: + static float calcNormalizedScore(const int *before, const int beforeLength, + const int *after, const int afterLength, const int score); + static int editDistance(const int *before, const int beforeLength, const int *after, + const int afterLength); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(AutocorrectionThresholdUtils); + + static const int MAX_INITIAL_SCORE; + static const int TYPED_LETTER_MULTIPLIER; + static const int FULL_WORD_MULTIPLIER; +}; +} // namespace latinime +#endif // LATINIME_AUTOCORRECTION_THRESHOLD_UTILS_H diff --git a/app/src/main/jni/src/utils/byte_array_view.h b/app/src/main/jni/src/utils/byte_array_view.h new file mode 100644 index 000000000..2b778af6f --- /dev/null +++ b/app/src/main/jni/src/utils/byte_array_view.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BYTE_ARRAY_VIEW_H +#define LATINIME_BYTE_ARRAY_VIEW_H + +#include +#include + +#include "defines.h" + +namespace latinime { + +/** + * Helper class used to keep track of read accesses for a given memory region. 
+ */ +class ReadOnlyByteArrayView { + public: + ReadOnlyByteArrayView() : mPtr(nullptr), mSize(0) {} + + ReadOnlyByteArrayView(const uint8_t *const ptr, const size_t size) + : mPtr(ptr), mSize(size) {} + + AK_FORCE_INLINE size_t size() const { + return mSize; + } + + AK_FORCE_INLINE const uint8_t *data() const { + return mPtr; + } + + AK_FORCE_INLINE const ReadOnlyByteArrayView skip(const size_t n) const { + if (mSize <= n) { + return ReadOnlyByteArrayView(); + } + return ReadOnlyByteArrayView(mPtr + n, mSize - n); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(ReadOnlyByteArrayView); + + const uint8_t *const mPtr; + const size_t mSize; +}; + +/** + * Helper class used to keep track of read-write accesses for a given memory region. + */ +class ReadWriteByteArrayView { + public: + ReadWriteByteArrayView() : mPtr(nullptr), mSize(0) {} + + ReadWriteByteArrayView(uint8_t *const ptr, const size_t size) + : mPtr(ptr), mSize(size) {} + + AK_FORCE_INLINE size_t size() const { + return mSize; + } + + AK_FORCE_INLINE uint8_t *data() const { + return mPtr; + } + + AK_FORCE_INLINE ReadOnlyByteArrayView getReadOnlyView() const { + return ReadOnlyByteArrayView(mPtr, mSize); + } + + ReadWriteByteArrayView subView(const size_t start, const size_t n) const { + ASSERT(start + n <= mSize); + return ReadWriteByteArrayView(mPtr + start, n); + } + + private: + // Default copy constructor and assignment operator are used for using this class with STL + // containers. + + // These members cannot be const to have the assignment operator. 
+ uint8_t *mPtr; + size_t mSize; +}; + +} // namespace latinime +#endif // LATINIME_BYTE_ARRAY_VIEW_H diff --git a/app/src/main/jni/src/utils/char_utils.cpp b/app/src/main/jni/src/utils/char_utils.cpp new file mode 100644 index 000000000..a43e6dd62 --- /dev/null +++ b/app/src/main/jni/src/utils/char_utils.cpp @@ -0,0 +1,1287 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/char_utils.h" + +#include + +#include "defines.h" + +namespace latinime { + +const int CharUtils::MIN_UNICODE_CODE_POINT = 0; +const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF; + +struct LatinCapitalSmallPair { + unsigned short capital; + unsigned short small; +}; + +/* + * How to update the SORTED_CHAR_MAP[] array. + * + * 1. Download http://unicode.org/Public/UNIDATA/UnicodeData.txt + * + * 2. Have a latest version of ICU4C dev package installed + * (Note: the current data has been generated with version 4.8) + * $ apt-get install libicu-dev + * + * 3. Build the following code + * $ g++ -o char_utils -I.. 
-DUPDATING_CHAR_UTILS char_utils.cpp -licuuc + */ +#ifdef UPDATING_CHAR_UTILS +#include +#include // ICU4C + +extern "C" int main() { + for (unsigned short c = 0; c < 0xFFFF; c++) { + if (c <= 0x7F) continue; + const unsigned short icu4cLowerC = u_tolower(c); + const unsigned short myLowerC = CharUtils::latin_tolower(c); + if (c != icu4cLowerC) { +#ifdef CONFIRMING_CHAR_UTILS + if (icu4cLowerC != myLowerC) { + fprintf(stderr, "icu4cLowerC != myLowerC, 0x%04X, 0x%04X\n", icu4cLowerC, myLowerC); + } +#else // CONFIRMING_CHAR_UTILS + printf("0x%04X, 0x%04X\n", c, icu4cLowerC); +#endif // CONFIRMING_CHAR_UTILS + } + } +} +#endif // UPDATING_CHAR_UTILS +/* + * 4. Process the list with UnicodeData.txt + * (You need UnicodeData.txt in the current directory) + * $ ./char_utils | sort -u | \ + * perl -e 'open(FH, "UnicodeData.txt"); @buf = ; close(FH); \ + * while(<>){/0x(\w*), 0x(\w*)/; @lines = grep(/^$1/, @buf); @cols = split(/;/, $lines[0]); \ + * print " { 0x$1, 0x$cols[13] }, // $cols[1]\n";}' + * + * 5. Update the SORTED_CHAR_MAP[] array below with the output above. + * Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully. + * $ g++ -o char_utils -I.. 
-DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc + * $ ./char_utils + * $ + */ +static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = { + { 0x00C0, 0x00E0 }, // LATIN CAPITAL LETTER A WITH GRAVE + { 0x00C1, 0x00E1 }, // LATIN CAPITAL LETTER A WITH ACUTE + { 0x00C2, 0x00E2 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX + { 0x00C3, 0x00E3 }, // LATIN CAPITAL LETTER A WITH TILDE + { 0x00C4, 0x00E4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS + { 0x00C5, 0x00E5 }, // LATIN CAPITAL LETTER A WITH RING ABOVE + { 0x00C6, 0x00E6 }, // LATIN CAPITAL LETTER AE + { 0x00C7, 0x00E7 }, // LATIN CAPITAL LETTER C WITH CEDILLA + { 0x00C8, 0x00E8 }, // LATIN CAPITAL LETTER E WITH GRAVE + { 0x00C9, 0x00E9 }, // LATIN CAPITAL LETTER E WITH ACUTE + { 0x00CA, 0x00EA }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX + { 0x00CB, 0x00EB }, // LATIN CAPITAL LETTER E WITH DIAERESIS + { 0x00CC, 0x00EC }, // LATIN CAPITAL LETTER I WITH GRAVE + { 0x00CD, 0x00ED }, // LATIN CAPITAL LETTER I WITH ACUTE + { 0x00CE, 0x00EE }, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX + { 0x00CF, 0x00EF }, // LATIN CAPITAL LETTER I WITH DIAERESIS + { 0x00D0, 0x00F0 }, // LATIN CAPITAL LETTER ETH + { 0x00D1, 0x00F1 }, // LATIN CAPITAL LETTER N WITH TILDE + { 0x00D2, 0x00F2 }, // LATIN CAPITAL LETTER O WITH GRAVE + { 0x00D3, 0x00F3 }, // LATIN CAPITAL LETTER O WITH ACUTE + { 0x00D4, 0x00F4 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX + { 0x00D5, 0x00F5 }, // LATIN CAPITAL LETTER O WITH TILDE + { 0x00D6, 0x00F6 }, // LATIN CAPITAL LETTER O WITH DIAERESIS + { 0x00D8, 0x00F8 }, // LATIN CAPITAL LETTER O WITH STROKE + { 0x00D9, 0x00F9 }, // LATIN CAPITAL LETTER U WITH GRAVE + { 0x00DA, 0x00FA }, // LATIN CAPITAL LETTER U WITH ACUTE + { 0x00DB, 0x00FB }, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX + { 0x00DC, 0x00FC }, // LATIN CAPITAL LETTER U WITH DIAERESIS + { 0x00DD, 0x00FD }, // LATIN CAPITAL LETTER Y WITH ACUTE + { 0x00DE, 0x00FE }, // LATIN CAPITAL LETTER THORN + { 0x0100, 0x0101 }, // LATIN CAPITAL 
LETTER A WITH MACRON + { 0x0102, 0x0103 }, // LATIN CAPITAL LETTER A WITH BREVE + { 0x0104, 0x0105 }, // LATIN CAPITAL LETTER A WITH OGONEK + { 0x0106, 0x0107 }, // LATIN CAPITAL LETTER C WITH ACUTE + { 0x0108, 0x0109 }, // LATIN CAPITAL LETTER C WITH CIRCUMFLEX + { 0x010A, 0x010B }, // LATIN CAPITAL LETTER C WITH DOT ABOVE + { 0x010C, 0x010D }, // LATIN CAPITAL LETTER C WITH CARON + { 0x010E, 0x010F }, // LATIN CAPITAL LETTER D WITH CARON + { 0x0110, 0x0111 }, // LATIN CAPITAL LETTER D WITH STROKE + { 0x0112, 0x0113 }, // LATIN CAPITAL LETTER E WITH MACRON + { 0x0114, 0x0115 }, // LATIN CAPITAL LETTER E WITH BREVE + { 0x0116, 0x0117 }, // LATIN CAPITAL LETTER E WITH DOT ABOVE + { 0x0118, 0x0119 }, // LATIN CAPITAL LETTER E WITH OGONEK + { 0x011A, 0x011B }, // LATIN CAPITAL LETTER E WITH CARON + { 0x011C, 0x011D }, // LATIN CAPITAL LETTER G WITH CIRCUMFLEX + { 0x011E, 0x011F }, // LATIN CAPITAL LETTER G WITH BREVE + { 0x0120, 0x0121 }, // LATIN CAPITAL LETTER G WITH DOT ABOVE + { 0x0122, 0x0123 }, // LATIN CAPITAL LETTER G WITH CEDILLA + { 0x0124, 0x0125 }, // LATIN CAPITAL LETTER H WITH CIRCUMFLEX + { 0x0126, 0x0127 }, // LATIN CAPITAL LETTER H WITH STROKE + { 0x0128, 0x0129 }, // LATIN CAPITAL LETTER I WITH TILDE + { 0x012A, 0x012B }, // LATIN CAPITAL LETTER I WITH MACRON + { 0x012C, 0x012D }, // LATIN CAPITAL LETTER I WITH BREVE + { 0x012E, 0x012F }, // LATIN CAPITAL LETTER I WITH OGONEK + { 0x0130, 0x0069 }, // LATIN CAPITAL LETTER I WITH DOT ABOVE + { 0x0132, 0x0133 }, // LATIN CAPITAL LIGATURE IJ + { 0x0134, 0x0135 }, // LATIN CAPITAL LETTER J WITH CIRCUMFLEX + { 0x0136, 0x0137 }, // LATIN CAPITAL LETTER K WITH CEDILLA + { 0x0139, 0x013A }, // LATIN CAPITAL LETTER L WITH ACUTE + { 0x013B, 0x013C }, // LATIN CAPITAL LETTER L WITH CEDILLA + { 0x013D, 0x013E }, // LATIN CAPITAL LETTER L WITH CARON + { 0x013F, 0x0140 }, // LATIN CAPITAL LETTER L WITH MIDDLE DOT + { 0x0141, 0x0142 }, // LATIN CAPITAL LETTER L WITH STROKE + { 0x0143, 0x0144 }, // LATIN CAPITAL 
LETTER N WITH ACUTE + { 0x0145, 0x0146 }, // LATIN CAPITAL LETTER N WITH CEDILLA + { 0x0147, 0x0148 }, // LATIN CAPITAL LETTER N WITH CARON + { 0x014A, 0x014B }, // LATIN CAPITAL LETTER ENG + { 0x014C, 0x014D }, // LATIN CAPITAL LETTER O WITH MACRON + { 0x014E, 0x014F }, // LATIN CAPITAL LETTER O WITH BREVE + { 0x0150, 0x0151 }, // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE + { 0x0152, 0x0153 }, // LATIN CAPITAL LIGATURE OE + { 0x0154, 0x0155 }, // LATIN CAPITAL LETTER R WITH ACUTE + { 0x0156, 0x0157 }, // LATIN CAPITAL LETTER R WITH CEDILLA + { 0x0158, 0x0159 }, // LATIN CAPITAL LETTER R WITH CARON + { 0x015A, 0x015B }, // LATIN CAPITAL LETTER S WITH ACUTE + { 0x015C, 0x015D }, // LATIN CAPITAL LETTER S WITH CIRCUMFLEX + { 0x015E, 0x015F }, // LATIN CAPITAL LETTER S WITH CEDILLA + { 0x0160, 0x0161 }, // LATIN CAPITAL LETTER S WITH CARON + { 0x0162, 0x0163 }, // LATIN CAPITAL LETTER T WITH CEDILLA + { 0x0164, 0x0165 }, // LATIN CAPITAL LETTER T WITH CARON + { 0x0166, 0x0167 }, // LATIN CAPITAL LETTER T WITH STROKE + { 0x0168, 0x0169 }, // LATIN CAPITAL LETTER U WITH TILDE + { 0x016A, 0x016B }, // LATIN CAPITAL LETTER U WITH MACRON + { 0x016C, 0x016D }, // LATIN CAPITAL LETTER U WITH BREVE + { 0x016E, 0x016F }, // LATIN CAPITAL LETTER U WITH RING ABOVE + { 0x0170, 0x0171 }, // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE + { 0x0172, 0x0173 }, // LATIN CAPITAL LETTER U WITH OGONEK + { 0x0174, 0x0175 }, // LATIN CAPITAL LETTER W WITH CIRCUMFLEX + { 0x0176, 0x0177 }, // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX + { 0x0178, 0x00FF }, // LATIN CAPITAL LETTER Y WITH DIAERESIS + { 0x0179, 0x017A }, // LATIN CAPITAL LETTER Z WITH ACUTE + { 0x017B, 0x017C }, // LATIN CAPITAL LETTER Z WITH DOT ABOVE + { 0x017D, 0x017E }, // LATIN CAPITAL LETTER Z WITH CARON + { 0x0181, 0x0253 }, // LATIN CAPITAL LETTER B WITH HOOK + { 0x0182, 0x0183 }, // LATIN CAPITAL LETTER B WITH TOPBAR + { 0x0184, 0x0185 }, // LATIN CAPITAL LETTER TONE SIX + { 0x0186, 0x0254 }, // LATIN CAPITAL LETTER OPEN O + { 
0x0187, 0x0188 }, // LATIN CAPITAL LETTER C WITH HOOK + { 0x0189, 0x0256 }, // LATIN CAPITAL LETTER AFRICAN D + { 0x018A, 0x0257 }, // LATIN CAPITAL LETTER D WITH HOOK + { 0x018B, 0x018C }, // LATIN CAPITAL LETTER D WITH TOPBAR + { 0x018E, 0x01DD }, // LATIN CAPITAL LETTER REVERSED E + { 0x018F, 0x0259 }, // LATIN CAPITAL LETTER SCHWA + { 0x0190, 0x025B }, // LATIN CAPITAL LETTER OPEN E + { 0x0191, 0x0192 }, // LATIN CAPITAL LETTER F WITH HOOK + { 0x0193, 0x0260 }, // LATIN CAPITAL LETTER G WITH HOOK + { 0x0194, 0x0263 }, // LATIN CAPITAL LETTER GAMMA + { 0x0196, 0x0269 }, // LATIN CAPITAL LETTER IOTA + { 0x0197, 0x0268 }, // LATIN CAPITAL LETTER I WITH STROKE + { 0x0198, 0x0199 }, // LATIN CAPITAL LETTER K WITH HOOK + { 0x019C, 0x026F }, // LATIN CAPITAL LETTER TURNED M + { 0x019D, 0x0272 }, // LATIN CAPITAL LETTER N WITH LEFT HOOK + { 0x019F, 0x0275 }, // LATIN CAPITAL LETTER O WITH MIDDLE TILDE + { 0x01A0, 0x01A1 }, // LATIN CAPITAL LETTER O WITH HORN + { 0x01A2, 0x01A3 }, // LATIN CAPITAL LETTER OI + { 0x01A4, 0x01A5 }, // LATIN CAPITAL LETTER P WITH HOOK + { 0x01A6, 0x0280 }, // LATIN LETTER YR + { 0x01A7, 0x01A8 }, // LATIN CAPITAL LETTER TONE TWO + { 0x01A9, 0x0283 }, // LATIN CAPITAL LETTER ESH + { 0x01AC, 0x01AD }, // LATIN CAPITAL LETTER T WITH HOOK + { 0x01AE, 0x0288 }, // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK + { 0x01AF, 0x01B0 }, // LATIN CAPITAL LETTER U WITH HORN + { 0x01B1, 0x028A }, // LATIN CAPITAL LETTER UPSILON + { 0x01B2, 0x028B }, // LATIN CAPITAL LETTER V WITH HOOK + { 0x01B3, 0x01B4 }, // LATIN CAPITAL LETTER Y WITH HOOK + { 0x01B5, 0x01B6 }, // LATIN CAPITAL LETTER Z WITH STROKE + { 0x01B7, 0x0292 }, // LATIN CAPITAL LETTER EZH + { 0x01B8, 0x01B9 }, // LATIN CAPITAL LETTER EZH REVERSED + { 0x01BC, 0x01BD }, // LATIN CAPITAL LETTER TONE FIVE + { 0x01C4, 0x01C6 }, // LATIN CAPITAL LETTER DZ WITH CARON + { 0x01C5, 0x01C6 }, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + { 0x01C7, 0x01C9 }, // LATIN CAPITAL LETTER LJ + { 
0x01C8, 0x01C9 }, // LATIN CAPITAL LETTER L WITH SMALL LETTER J + { 0x01CA, 0x01CC }, // LATIN CAPITAL LETTER NJ + { 0x01CB, 0x01CC }, // LATIN CAPITAL LETTER N WITH SMALL LETTER J + { 0x01CD, 0x01CE }, // LATIN CAPITAL LETTER A WITH CARON + { 0x01CF, 0x01D0 }, // LATIN CAPITAL LETTER I WITH CARON + { 0x01D1, 0x01D2 }, // LATIN CAPITAL LETTER O WITH CARON + { 0x01D3, 0x01D4 }, // LATIN CAPITAL LETTER U WITH CARON + { 0x01D5, 0x01D6 }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON + { 0x01D7, 0x01D8 }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE + { 0x01D9, 0x01DA }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON + { 0x01DB, 0x01DC }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE + { 0x01DE, 0x01DF }, // LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON + { 0x01E0, 0x01E1 }, // LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON + { 0x01E2, 0x01E3 }, // LATIN CAPITAL LETTER AE WITH MACRON + { 0x01E4, 0x01E5 }, // LATIN CAPITAL LETTER G WITH STROKE + { 0x01E6, 0x01E7 }, // LATIN CAPITAL LETTER G WITH CARON + { 0x01E8, 0x01E9 }, // LATIN CAPITAL LETTER K WITH CARON + { 0x01EA, 0x01EB }, // LATIN CAPITAL LETTER O WITH OGONEK + { 0x01EC, 0x01ED }, // LATIN CAPITAL LETTER O WITH OGONEK AND MACRON + { 0x01EE, 0x01EF }, // LATIN CAPITAL LETTER EZH WITH CARON + { 0x01F1, 0x01F3 }, // LATIN CAPITAL LETTER DZ + { 0x01F2, 0x01F3 }, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z + { 0x01F4, 0x01F5 }, // LATIN CAPITAL LETTER G WITH ACUTE + { 0x01F6, 0x0195 }, // LATIN CAPITAL LETTER HWAIR + { 0x01F7, 0x01BF }, // LATIN CAPITAL LETTER WYNN + { 0x01F8, 0x01F9 }, // LATIN CAPITAL LETTER N WITH GRAVE + { 0x01FA, 0x01FB }, // LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE + { 0x01FC, 0x01FD }, // LATIN CAPITAL LETTER AE WITH ACUTE + { 0x01FE, 0x01FF }, // LATIN CAPITAL LETTER O WITH STROKE AND ACUTE + { 0x0200, 0x0201 }, // LATIN CAPITAL LETTER A WITH DOUBLE GRAVE + { 0x0202, 0x0203 }, // LATIN CAPITAL LETTER A WITH INVERTED BREVE + { 0x0204, 0x0205 }, // LATIN 
CAPITAL LETTER E WITH DOUBLE GRAVE + { 0x0206, 0x0207 }, // LATIN CAPITAL LETTER E WITH INVERTED BREVE + { 0x0208, 0x0209 }, // LATIN CAPITAL LETTER I WITH DOUBLE GRAVE + { 0x020A, 0x020B }, // LATIN CAPITAL LETTER I WITH INVERTED BREVE + { 0x020C, 0x020D }, // LATIN CAPITAL LETTER O WITH DOUBLE GRAVE + { 0x020E, 0x020F }, // LATIN CAPITAL LETTER O WITH INVERTED BREVE + { 0x0210, 0x0211 }, // LATIN CAPITAL LETTER R WITH DOUBLE GRAVE + { 0x0212, 0x0213 }, // LATIN CAPITAL LETTER R WITH INVERTED BREVE + { 0x0214, 0x0215 }, // LATIN CAPITAL LETTER U WITH DOUBLE GRAVE + { 0x0216, 0x0217 }, // LATIN CAPITAL LETTER U WITH INVERTED BREVE + { 0x0218, 0x0219 }, // LATIN CAPITAL LETTER S WITH COMMA BELOW + { 0x021A, 0x021B }, // LATIN CAPITAL LETTER T WITH COMMA BELOW + { 0x021C, 0x021D }, // LATIN CAPITAL LETTER YOGH + { 0x021E, 0x021F }, // LATIN CAPITAL LETTER H WITH CARON + { 0x0220, 0x019E }, // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG + { 0x0222, 0x0223 }, // LATIN CAPITAL LETTER OU + { 0x0224, 0x0225 }, // LATIN CAPITAL LETTER Z WITH HOOK + { 0x0226, 0x0227 }, // LATIN CAPITAL LETTER A WITH DOT ABOVE + { 0x0228, 0x0229 }, // LATIN CAPITAL LETTER E WITH CEDILLA + { 0x022A, 0x022B }, // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON + { 0x022C, 0x022D }, // LATIN CAPITAL LETTER O WITH TILDE AND MACRON + { 0x022E, 0x022F }, // LATIN CAPITAL LETTER O WITH DOT ABOVE + { 0x0230, 0x0231 }, // LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON + { 0x0232, 0x0233 }, // LATIN CAPITAL LETTER Y WITH MACRON + { 0x023A, 0x2C65 }, // LATIN CAPITAL LETTER A WITH STROKE + { 0x023B, 0x023C }, // LATIN CAPITAL LETTER C WITH STROKE + { 0x023D, 0x019A }, // LATIN CAPITAL LETTER L WITH BAR + { 0x023E, 0x2C66 }, // LATIN CAPITAL LETTER T WITH DIAGONAL STROKE + { 0x0241, 0x0242 }, // LATIN CAPITAL LETTER GLOTTAL STOP + { 0x0243, 0x0180 }, // LATIN CAPITAL LETTER B WITH STROKE + { 0x0244, 0x0289 }, // LATIN CAPITAL LETTER U BAR + { 0x0245, 0x028C }, // LATIN CAPITAL LETTER TURNED V + { 
0x0246, 0x0247 }, // LATIN CAPITAL LETTER E WITH STROKE + { 0x0248, 0x0249 }, // LATIN CAPITAL LETTER J WITH STROKE + { 0x024A, 0x024B }, // LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL + { 0x024C, 0x024D }, // LATIN CAPITAL LETTER R WITH STROKE + { 0x024E, 0x024F }, // LATIN CAPITAL LETTER Y WITH STROKE + { 0x0370, 0x0371 }, // GREEK CAPITAL LETTER HETA + { 0x0372, 0x0373 }, // GREEK CAPITAL LETTER ARCHAIC SAMPI + { 0x0376, 0x0377 }, // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA + { 0x0386, 0x03AC }, // GREEK CAPITAL LETTER ALPHA WITH TONOS + { 0x0388, 0x03AD }, // GREEK CAPITAL LETTER EPSILON WITH TONOS + { 0x0389, 0x03AE }, // GREEK CAPITAL LETTER ETA WITH TONOS + { 0x038A, 0x03AF }, // GREEK CAPITAL LETTER IOTA WITH TONOS + { 0x038C, 0x03CC }, // GREEK CAPITAL LETTER OMICRON WITH TONOS + { 0x038E, 0x03CD }, // GREEK CAPITAL LETTER UPSILON WITH TONOS + { 0x038F, 0x03CE }, // GREEK CAPITAL LETTER OMEGA WITH TONOS + { 0x0391, 0x03B1 }, // GREEK CAPITAL LETTER ALPHA + { 0x0392, 0x03B2 }, // GREEK CAPITAL LETTER BETA + { 0x0393, 0x03B3 }, // GREEK CAPITAL LETTER GAMMA + { 0x0394, 0x03B4 }, // GREEK CAPITAL LETTER DELTA + { 0x0395, 0x03B5 }, // GREEK CAPITAL LETTER EPSILON + { 0x0396, 0x03B6 }, // GREEK CAPITAL LETTER ZETA + { 0x0397, 0x03B7 }, // GREEK CAPITAL LETTER ETA + { 0x0398, 0x03B8 }, // GREEK CAPITAL LETTER THETA + { 0x0399, 0x03B9 }, // GREEK CAPITAL LETTER IOTA + { 0x039A, 0x03BA }, // GREEK CAPITAL LETTER KAPPA + { 0x039B, 0x03BB }, // GREEK CAPITAL LETTER LAMDA + { 0x039C, 0x03BC }, // GREEK CAPITAL LETTER MU + { 0x039D, 0x03BD }, // GREEK CAPITAL LETTER NU + { 0x039E, 0x03BE }, // GREEK CAPITAL LETTER XI + { 0x039F, 0x03BF }, // GREEK CAPITAL LETTER OMICRON + { 0x03A0, 0x03C0 }, // GREEK CAPITAL LETTER PI + { 0x03A1, 0x03C1 }, // GREEK CAPITAL LETTER RHO + { 0x03A3, 0x03C3 }, // GREEK CAPITAL LETTER SIGMA + { 0x03A4, 0x03C4 }, // GREEK CAPITAL LETTER TAU + { 0x03A5, 0x03C5 }, // GREEK CAPITAL LETTER UPSILON + { 0x03A6, 0x03C6 }, // GREEK CAPITAL LETTER PHI 
+ { 0x03A7, 0x03C7 }, // GREEK CAPITAL LETTER CHI + { 0x03A8, 0x03C8 }, // GREEK CAPITAL LETTER PSI + { 0x03A9, 0x03C9 }, // GREEK CAPITAL LETTER OMEGA + { 0x03AA, 0x03CA }, // GREEK CAPITAL LETTER IOTA WITH DIALYTIKA + { 0x03AB, 0x03CB }, // GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA + { 0x03CF, 0x03D7 }, // GREEK CAPITAL KAI SYMBOL + { 0x03D8, 0x03D9 }, // GREEK LETTER ARCHAIC KOPPA + { 0x03DA, 0x03DB }, // GREEK LETTER STIGMA + { 0x03DC, 0x03DD }, // GREEK LETTER DIGAMMA + { 0x03DE, 0x03DF }, // GREEK LETTER KOPPA + { 0x03E0, 0x03E1 }, // GREEK LETTER SAMPI + { 0x03E2, 0x03E3 }, // COPTIC CAPITAL LETTER SHEI + { 0x03E4, 0x03E5 }, // COPTIC CAPITAL LETTER FEI + { 0x03E6, 0x03E7 }, // COPTIC CAPITAL LETTER KHEI + { 0x03E8, 0x03E9 }, // COPTIC CAPITAL LETTER HORI + { 0x03EA, 0x03EB }, // COPTIC CAPITAL LETTER GANGIA + { 0x03EC, 0x03ED }, // COPTIC CAPITAL LETTER SHIMA + { 0x03EE, 0x03EF }, // COPTIC CAPITAL LETTER DEI + { 0x03F4, 0x03B8 }, // GREEK CAPITAL THETA SYMBOL + { 0x03F7, 0x03F8 }, // GREEK CAPITAL LETTER SHO + { 0x03F9, 0x03F2 }, // GREEK CAPITAL LUNATE SIGMA SYMBOL + { 0x03FA, 0x03FB }, // GREEK CAPITAL LETTER SAN + { 0x03FD, 0x037B }, // GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL + { 0x03FE, 0x037C }, // GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL + { 0x03FF, 0x037D }, // GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL + { 0x0400, 0x0450 }, // CYRILLIC CAPITAL LETTER IE WITH GRAVE + { 0x0401, 0x0451 }, // CYRILLIC CAPITAL LETTER IO + { 0x0402, 0x0452 }, // CYRILLIC CAPITAL LETTER DJE + { 0x0403, 0x0453 }, // CYRILLIC CAPITAL LETTER GJE + { 0x0404, 0x0454 }, // CYRILLIC CAPITAL LETTER UKRAINIAN IE + { 0x0405, 0x0455 }, // CYRILLIC CAPITAL LETTER DZE + { 0x0406, 0x0456 }, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I + { 0x0407, 0x0457 }, // CYRILLIC CAPITAL LETTER YI + { 0x0408, 0x0458 }, // CYRILLIC CAPITAL LETTER JE + { 0x0409, 0x0459 }, // CYRILLIC CAPITAL LETTER LJE + { 0x040A, 0x045A }, // CYRILLIC CAPITAL LETTER NJE + { 0x040B, 0x045B }, // 
CYRILLIC CAPITAL LETTER TSHE + { 0x040C, 0x045C }, // CYRILLIC CAPITAL LETTER KJE + { 0x040D, 0x045D }, // CYRILLIC CAPITAL LETTER I WITH GRAVE + { 0x040E, 0x045E }, // CYRILLIC CAPITAL LETTER SHORT U + { 0x040F, 0x045F }, // CYRILLIC CAPITAL LETTER DZHE + { 0x0410, 0x0430 }, // CYRILLIC CAPITAL LETTER A + { 0x0411, 0x0431 }, // CYRILLIC CAPITAL LETTER BE + { 0x0412, 0x0432 }, // CYRILLIC CAPITAL LETTER VE + { 0x0413, 0x0433 }, // CYRILLIC CAPITAL LETTER GHE + { 0x0414, 0x0434 }, // CYRILLIC CAPITAL LETTER DE + { 0x0415, 0x0435 }, // CYRILLIC CAPITAL LETTER IE + { 0x0416, 0x0436 }, // CYRILLIC CAPITAL LETTER ZHE + { 0x0417, 0x0437 }, // CYRILLIC CAPITAL LETTER ZE + { 0x0418, 0x0438 }, // CYRILLIC CAPITAL LETTER I + { 0x0419, 0x0439 }, // CYRILLIC CAPITAL LETTER SHORT I + { 0x041A, 0x043A }, // CYRILLIC CAPITAL LETTER KA + { 0x041B, 0x043B }, // CYRILLIC CAPITAL LETTER EL + { 0x041C, 0x043C }, // CYRILLIC CAPITAL LETTER EM + { 0x041D, 0x043D }, // CYRILLIC CAPITAL LETTER EN + { 0x041E, 0x043E }, // CYRILLIC CAPITAL LETTER O + { 0x041F, 0x043F }, // CYRILLIC CAPITAL LETTER PE + { 0x0420, 0x0440 }, // CYRILLIC CAPITAL LETTER ER + { 0x0421, 0x0441 }, // CYRILLIC CAPITAL LETTER ES + { 0x0422, 0x0442 }, // CYRILLIC CAPITAL LETTER TE + { 0x0423, 0x0443 }, // CYRILLIC CAPITAL LETTER U + { 0x0424, 0x0444 }, // CYRILLIC CAPITAL LETTER EF + { 0x0425, 0x0445 }, // CYRILLIC CAPITAL LETTER HA + { 0x0426, 0x0446 }, // CYRILLIC CAPITAL LETTER TSE + { 0x0427, 0x0447 }, // CYRILLIC CAPITAL LETTER CHE + { 0x0428, 0x0448 }, // CYRILLIC CAPITAL LETTER SHA + { 0x0429, 0x0449 }, // CYRILLIC CAPITAL LETTER SHCHA + { 0x042A, 0x044A }, // CYRILLIC CAPITAL LETTER HARD SIGN + { 0x042B, 0x044B }, // CYRILLIC CAPITAL LETTER YERU + { 0x042C, 0x044C }, // CYRILLIC CAPITAL LETTER SOFT SIGN + { 0x042D, 0x044D }, // CYRILLIC CAPITAL LETTER E + { 0x042E, 0x044E }, // CYRILLIC CAPITAL LETTER YU + { 0x042F, 0x044F }, // CYRILLIC CAPITAL LETTER YA + { 0x0460, 0x0461 }, // CYRILLIC CAPITAL LETTER OMEGA + 
{ 0x0462, 0x0463 }, // CYRILLIC CAPITAL LETTER YAT + { 0x0464, 0x0465 }, // CYRILLIC CAPITAL LETTER IOTIFIED E + { 0x0466, 0x0467 }, // CYRILLIC CAPITAL LETTER LITTLE YUS + { 0x0468, 0x0469 }, // CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS + { 0x046A, 0x046B }, // CYRILLIC CAPITAL LETTER BIG YUS + { 0x046C, 0x046D }, // CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS + { 0x046E, 0x046F }, // CYRILLIC CAPITAL LETTER KSI + { 0x0470, 0x0471 }, // CYRILLIC CAPITAL LETTER PSI + { 0x0472, 0x0473 }, // CYRILLIC CAPITAL LETTER FITA + { 0x0474, 0x0475 }, // CYRILLIC CAPITAL LETTER IZHITSA + { 0x0476, 0x0477 }, // CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT + { 0x0478, 0x0479 }, // CYRILLIC CAPITAL LETTER UK + { 0x047A, 0x047B }, // CYRILLIC CAPITAL LETTER ROUND OMEGA + { 0x047C, 0x047D }, // CYRILLIC CAPITAL LETTER OMEGA WITH TITLO + { 0x047E, 0x047F }, // CYRILLIC CAPITAL LETTER OT + { 0x0480, 0x0481 }, // CYRILLIC CAPITAL LETTER KOPPA + { 0x048A, 0x048B }, // CYRILLIC CAPITAL LETTER SHORT I WITH TAIL + { 0x048C, 0x048D }, // CYRILLIC CAPITAL LETTER SEMISOFT SIGN + { 0x048E, 0x048F }, // CYRILLIC CAPITAL LETTER ER WITH TICK + { 0x0490, 0x0491 }, // CYRILLIC CAPITAL LETTER GHE WITH UPTURN + { 0x0492, 0x0493 }, // CYRILLIC CAPITAL LETTER GHE WITH STROKE + { 0x0494, 0x0495 }, // CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK + { 0x0496, 0x0497 }, // CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER + { 0x0498, 0x0499 }, // CYRILLIC CAPITAL LETTER ZE WITH DESCENDER + { 0x049A, 0x049B }, // CYRILLIC CAPITAL LETTER KA WITH DESCENDER + { 0x049C, 0x049D }, // CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE + { 0x049E, 0x049F }, // CYRILLIC CAPITAL LETTER KA WITH STROKE + { 0x04A0, 0x04A1 }, // CYRILLIC CAPITAL LETTER BASHKIR KA + { 0x04A2, 0x04A3 }, // CYRILLIC CAPITAL LETTER EN WITH DESCENDER + { 0x04A4, 0x04A5 }, // CYRILLIC CAPITAL LIGATURE EN GHE + { 0x04A6, 0x04A7 }, // CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK + { 0x04A8, 0x04A9 }, // CYRILLIC CAPITAL LETTER ABKHASIAN HA + { 
0x04AA, 0x04AB }, // CYRILLIC CAPITAL LETTER ES WITH DESCENDER + { 0x04AC, 0x04AD }, // CYRILLIC CAPITAL LETTER TE WITH DESCENDER + { 0x04AE, 0x04AF }, // CYRILLIC CAPITAL LETTER STRAIGHT U + { 0x04B0, 0x04B1 }, // CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE + { 0x04B2, 0x04B3 }, // CYRILLIC CAPITAL LETTER HA WITH DESCENDER + { 0x04B4, 0x04B5 }, // CYRILLIC CAPITAL LIGATURE TE TSE + { 0x04B6, 0x04B7 }, // CYRILLIC CAPITAL LETTER CHE WITH DESCENDER + { 0x04B8, 0x04B9 }, // CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE + { 0x04BA, 0x04BB }, // CYRILLIC CAPITAL LETTER SHHA + { 0x04BC, 0x04BD }, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE + { 0x04BE, 0x04BF }, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER + { 0x04C0, 0x04CF }, // CYRILLIC LETTER PALOCHKA + { 0x04C1, 0x04C2 }, // CYRILLIC CAPITAL LETTER ZHE WITH BREVE + { 0x04C3, 0x04C4 }, // CYRILLIC CAPITAL LETTER KA WITH HOOK + { 0x04C5, 0x04C6 }, // CYRILLIC CAPITAL LETTER EL WITH TAIL + { 0x04C7, 0x04C8 }, // CYRILLIC CAPITAL LETTER EN WITH HOOK + { 0x04C9, 0x04CA }, // CYRILLIC CAPITAL LETTER EN WITH TAIL + { 0x04CB, 0x04CC }, // CYRILLIC CAPITAL LETTER KHAKASSIAN CHE + { 0x04CD, 0x04CE }, // CYRILLIC CAPITAL LETTER EM WITH TAIL + { 0x04D0, 0x04D1 }, // CYRILLIC CAPITAL LETTER A WITH BREVE + { 0x04D2, 0x04D3 }, // CYRILLIC CAPITAL LETTER A WITH DIAERESIS + { 0x04D4, 0x04D5 }, // CYRILLIC CAPITAL LIGATURE A IE + { 0x04D6, 0x04D7 }, // CYRILLIC CAPITAL LETTER IE WITH BREVE + { 0x04D8, 0x04D9 }, // CYRILLIC CAPITAL LETTER SCHWA + { 0x04DA, 0x04DB }, // CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS + { 0x04DC, 0x04DD }, // CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS + { 0x04DE, 0x04DF }, // CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS + { 0x04E0, 0x04E1 }, // CYRILLIC CAPITAL LETTER ABKHASIAN DZE + { 0x04E2, 0x04E3 }, // CYRILLIC CAPITAL LETTER I WITH MACRON + { 0x04E4, 0x04E5 }, // CYRILLIC CAPITAL LETTER I WITH DIAERESIS + { 0x04E6, 0x04E7 }, // CYRILLIC CAPITAL LETTER O WITH DIAERESIS + { 0x04E8, 0x04E9 }, 
// CYRILLIC CAPITAL LETTER BARRED O + { 0x04EA, 0x04EB }, // CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS + { 0x04EC, 0x04ED }, // CYRILLIC CAPITAL LETTER E WITH DIAERESIS + { 0x04EE, 0x04EF }, // CYRILLIC CAPITAL LETTER U WITH MACRON + { 0x04F0, 0x04F1 }, // CYRILLIC CAPITAL LETTER U WITH DIAERESIS + { 0x04F2, 0x04F3 }, // CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE + { 0x04F4, 0x04F5 }, // CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS + { 0x04F6, 0x04F7 }, // CYRILLIC CAPITAL LETTER GHE WITH DESCENDER + { 0x04F8, 0x04F9 }, // CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS + { 0x04FA, 0x04FB }, // CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK + { 0x04FC, 0x04FD }, // CYRILLIC CAPITAL LETTER HA WITH HOOK + { 0x04FE, 0x04FF }, // CYRILLIC CAPITAL LETTER HA WITH STROKE + { 0x0500, 0x0501 }, // CYRILLIC CAPITAL LETTER KOMI DE + { 0x0502, 0x0503 }, // CYRILLIC CAPITAL LETTER KOMI DJE + { 0x0504, 0x0505 }, // CYRILLIC CAPITAL LETTER KOMI ZJE + { 0x0506, 0x0507 }, // CYRILLIC CAPITAL LETTER KOMI DZJE + { 0x0508, 0x0509 }, // CYRILLIC CAPITAL LETTER KOMI LJE + { 0x050A, 0x050B }, // CYRILLIC CAPITAL LETTER KOMI NJE + { 0x050C, 0x050D }, // CYRILLIC CAPITAL LETTER KOMI SJE + { 0x050E, 0x050F }, // CYRILLIC CAPITAL LETTER KOMI TJE + { 0x0510, 0x0511 }, // CYRILLIC CAPITAL LETTER REVERSED ZE + { 0x0512, 0x0513 }, // CYRILLIC CAPITAL LETTER EL WITH HOOK + { 0x0514, 0x0515 }, // CYRILLIC CAPITAL LETTER LHA + { 0x0516, 0x0517 }, // CYRILLIC CAPITAL LETTER RHA + { 0x0518, 0x0519 }, // CYRILLIC CAPITAL LETTER YAE + { 0x051A, 0x051B }, // CYRILLIC CAPITAL LETTER QA + { 0x051C, 0x051D }, // CYRILLIC CAPITAL LETTER WE + { 0x051E, 0x051F }, // CYRILLIC CAPITAL LETTER ALEUT KA + { 0x0520, 0x0521 }, // CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK + { 0x0522, 0x0523 }, // CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK + { 0x0524, 0x0525 }, // CYRILLIC CAPITAL LETTER PE WITH DESCENDER + { 0x0526, 0x0527 }, // CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER + { 0x0531, 0x0561 }, // ARMENIAN 
CAPITAL LETTER AYB + { 0x0532, 0x0562 }, // ARMENIAN CAPITAL LETTER BEN + { 0x0533, 0x0563 }, // ARMENIAN CAPITAL LETTER GIM + { 0x0534, 0x0564 }, // ARMENIAN CAPITAL LETTER DA + { 0x0535, 0x0565 }, // ARMENIAN CAPITAL LETTER ECH + { 0x0536, 0x0566 }, // ARMENIAN CAPITAL LETTER ZA + { 0x0537, 0x0567 }, // ARMENIAN CAPITAL LETTER EH + { 0x0538, 0x0568 }, // ARMENIAN CAPITAL LETTER ET + { 0x0539, 0x0569 }, // ARMENIAN CAPITAL LETTER TO + { 0x053A, 0x056A }, // ARMENIAN CAPITAL LETTER ZHE + { 0x053B, 0x056B }, // ARMENIAN CAPITAL LETTER INI + { 0x053C, 0x056C }, // ARMENIAN CAPITAL LETTER LIWN + { 0x053D, 0x056D }, // ARMENIAN CAPITAL LETTER XEH + { 0x053E, 0x056E }, // ARMENIAN CAPITAL LETTER CA + { 0x053F, 0x056F }, // ARMENIAN CAPITAL LETTER KEN + { 0x0540, 0x0570 }, // ARMENIAN CAPITAL LETTER HO + { 0x0541, 0x0571 }, // ARMENIAN CAPITAL LETTER JA + { 0x0542, 0x0572 }, // ARMENIAN CAPITAL LETTER GHAD + { 0x0543, 0x0573 }, // ARMENIAN CAPITAL LETTER CHEH + { 0x0544, 0x0574 }, // ARMENIAN CAPITAL LETTER MEN + { 0x0545, 0x0575 }, // ARMENIAN CAPITAL LETTER YI + { 0x0546, 0x0576 }, // ARMENIAN CAPITAL LETTER NOW + { 0x0547, 0x0577 }, // ARMENIAN CAPITAL LETTER SHA + { 0x0548, 0x0578 }, // ARMENIAN CAPITAL LETTER VO + { 0x0549, 0x0579 }, // ARMENIAN CAPITAL LETTER CHA + { 0x054A, 0x057A }, // ARMENIAN CAPITAL LETTER PEH + { 0x054B, 0x057B }, // ARMENIAN CAPITAL LETTER JHEH + { 0x054C, 0x057C }, // ARMENIAN CAPITAL LETTER RA + { 0x054D, 0x057D }, // ARMENIAN CAPITAL LETTER SEH + { 0x054E, 0x057E }, // ARMENIAN CAPITAL LETTER VEW + { 0x054F, 0x057F }, // ARMENIAN CAPITAL LETTER TIWN + { 0x0550, 0x0580 }, // ARMENIAN CAPITAL LETTER REH + { 0x0551, 0x0581 }, // ARMENIAN CAPITAL LETTER CO + { 0x0552, 0x0582 }, // ARMENIAN CAPITAL LETTER YIWN + { 0x0553, 0x0583 }, // ARMENIAN CAPITAL LETTER PIWR + { 0x0554, 0x0584 }, // ARMENIAN CAPITAL LETTER KEH + { 0x0555, 0x0585 }, // ARMENIAN CAPITAL LETTER OH + { 0x0556, 0x0586 }, // ARMENIAN CAPITAL LETTER FEH + { 0x10A0, 0x2D00 }, // 
GEORGIAN CAPITAL LETTER AN + { 0x10A1, 0x2D01 }, // GEORGIAN CAPITAL LETTER BAN + { 0x10A2, 0x2D02 }, // GEORGIAN CAPITAL LETTER GAN + { 0x10A3, 0x2D03 }, // GEORGIAN CAPITAL LETTER DON + { 0x10A4, 0x2D04 }, // GEORGIAN CAPITAL LETTER EN + { 0x10A5, 0x2D05 }, // GEORGIAN CAPITAL LETTER VIN + { 0x10A6, 0x2D06 }, // GEORGIAN CAPITAL LETTER ZEN + { 0x10A7, 0x2D07 }, // GEORGIAN CAPITAL LETTER TAN + { 0x10A8, 0x2D08 }, // GEORGIAN CAPITAL LETTER IN + { 0x10A9, 0x2D09 }, // GEORGIAN CAPITAL LETTER KAN + { 0x10AA, 0x2D0A }, // GEORGIAN CAPITAL LETTER LAS + { 0x10AB, 0x2D0B }, // GEORGIAN CAPITAL LETTER MAN + { 0x10AC, 0x2D0C }, // GEORGIAN CAPITAL LETTER NAR + { 0x10AD, 0x2D0D }, // GEORGIAN CAPITAL LETTER ON + { 0x10AE, 0x2D0E }, // GEORGIAN CAPITAL LETTER PAR + { 0x10AF, 0x2D0F }, // GEORGIAN CAPITAL LETTER ZHAR + { 0x10B0, 0x2D10 }, // GEORGIAN CAPITAL LETTER RAE + { 0x10B1, 0x2D11 }, // GEORGIAN CAPITAL LETTER SAN + { 0x10B2, 0x2D12 }, // GEORGIAN CAPITAL LETTER TAR + { 0x10B3, 0x2D13 }, // GEORGIAN CAPITAL LETTER UN + { 0x10B4, 0x2D14 }, // GEORGIAN CAPITAL LETTER PHAR + { 0x10B5, 0x2D15 }, // GEORGIAN CAPITAL LETTER KHAR + { 0x10B6, 0x2D16 }, // GEORGIAN CAPITAL LETTER GHAN + { 0x10B7, 0x2D17 }, // GEORGIAN CAPITAL LETTER QAR + { 0x10B8, 0x2D18 }, // GEORGIAN CAPITAL LETTER SHIN + { 0x10B9, 0x2D19 }, // GEORGIAN CAPITAL LETTER CHIN + { 0x10BA, 0x2D1A }, // GEORGIAN CAPITAL LETTER CAN + { 0x10BB, 0x2D1B }, // GEORGIAN CAPITAL LETTER JIL + { 0x10BC, 0x2D1C }, // GEORGIAN CAPITAL LETTER CIL + { 0x10BD, 0x2D1D }, // GEORGIAN CAPITAL LETTER CHAR + { 0x10BE, 0x2D1E }, // GEORGIAN CAPITAL LETTER XAN + { 0x10BF, 0x2D1F }, // GEORGIAN CAPITAL LETTER JHAN + { 0x10C0, 0x2D20 }, // GEORGIAN CAPITAL LETTER HAE + { 0x10C1, 0x2D21 }, // GEORGIAN CAPITAL LETTER HE + { 0x10C2, 0x2D22 }, // GEORGIAN CAPITAL LETTER HIE + { 0x10C3, 0x2D23 }, // GEORGIAN CAPITAL LETTER WE + { 0x10C4, 0x2D24 }, // GEORGIAN CAPITAL LETTER HAR + { 0x10C5, 0x2D25 }, // GEORGIAN CAPITAL LETTER HOE + { 
0x1E00, 0x1E01 }, // LATIN CAPITAL LETTER A WITH RING BELOW + { 0x1E02, 0x1E03 }, // LATIN CAPITAL LETTER B WITH DOT ABOVE + { 0x1E04, 0x1E05 }, // LATIN CAPITAL LETTER B WITH DOT BELOW + { 0x1E06, 0x1E07 }, // LATIN CAPITAL LETTER B WITH LINE BELOW + { 0x1E08, 0x1E09 }, // LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE + { 0x1E0A, 0x1E0B }, // LATIN CAPITAL LETTER D WITH DOT ABOVE + { 0x1E0C, 0x1E0D }, // LATIN CAPITAL LETTER D WITH DOT BELOW + { 0x1E0E, 0x1E0F }, // LATIN CAPITAL LETTER D WITH LINE BELOW + { 0x1E10, 0x1E11 }, // LATIN CAPITAL LETTER D WITH CEDILLA + { 0x1E12, 0x1E13 }, // LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW + { 0x1E14, 0x1E15 }, // LATIN CAPITAL LETTER E WITH MACRON AND GRAVE + { 0x1E16, 0x1E17 }, // LATIN CAPITAL LETTER E WITH MACRON AND ACUTE + { 0x1E18, 0x1E19 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW + { 0x1E1A, 0x1E1B }, // LATIN CAPITAL LETTER E WITH TILDE BELOW + { 0x1E1C, 0x1E1D }, // LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE + { 0x1E1E, 0x1E1F }, // LATIN CAPITAL LETTER F WITH DOT ABOVE + { 0x1E20, 0x1E21 }, // LATIN CAPITAL LETTER G WITH MACRON + { 0x1E22, 0x1E23 }, // LATIN CAPITAL LETTER H WITH DOT ABOVE + { 0x1E24, 0x1E25 }, // LATIN CAPITAL LETTER H WITH DOT BELOW + { 0x1E26, 0x1E27 }, // LATIN CAPITAL LETTER H WITH DIAERESIS + { 0x1E28, 0x1E29 }, // LATIN CAPITAL LETTER H WITH CEDILLA + { 0x1E2A, 0x1E2B }, // LATIN CAPITAL LETTER H WITH BREVE BELOW + { 0x1E2C, 0x1E2D }, // LATIN CAPITAL LETTER I WITH TILDE BELOW + { 0x1E2E, 0x1E2F }, // LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE + { 0x1E30, 0x1E31 }, // LATIN CAPITAL LETTER K WITH ACUTE + { 0x1E32, 0x1E33 }, // LATIN CAPITAL LETTER K WITH DOT BELOW + { 0x1E34, 0x1E35 }, // LATIN CAPITAL LETTER K WITH LINE BELOW + { 0x1E36, 0x1E37 }, // LATIN CAPITAL LETTER L WITH DOT BELOW + { 0x1E38, 0x1E39 }, // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON + { 0x1E3A, 0x1E3B }, // LATIN CAPITAL LETTER L WITH LINE BELOW + { 0x1E3C, 0x1E3D }, // LATIN CAPITAL LETTER L 
WITH CIRCUMFLEX BELOW + { 0x1E3E, 0x1E3F }, // LATIN CAPITAL LETTER M WITH ACUTE + { 0x1E40, 0x1E41 }, // LATIN CAPITAL LETTER M WITH DOT ABOVE + { 0x1E42, 0x1E43 }, // LATIN CAPITAL LETTER M WITH DOT BELOW + { 0x1E44, 0x1E45 }, // LATIN CAPITAL LETTER N WITH DOT ABOVE + { 0x1E46, 0x1E47 }, // LATIN CAPITAL LETTER N WITH DOT BELOW + { 0x1E48, 0x1E49 }, // LATIN CAPITAL LETTER N WITH LINE BELOW + { 0x1E4A, 0x1E4B }, // LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW + { 0x1E4C, 0x1E4D }, // LATIN CAPITAL LETTER O WITH TILDE AND ACUTE + { 0x1E4E, 0x1E4F }, // LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS + { 0x1E50, 0x1E51 }, // LATIN CAPITAL LETTER O WITH MACRON AND GRAVE + { 0x1E52, 0x1E53 }, // LATIN CAPITAL LETTER O WITH MACRON AND ACUTE + { 0x1E54, 0x1E55 }, // LATIN CAPITAL LETTER P WITH ACUTE + { 0x1E56, 0x1E57 }, // LATIN CAPITAL LETTER P WITH DOT ABOVE + { 0x1E58, 0x1E59 }, // LATIN CAPITAL LETTER R WITH DOT ABOVE + { 0x1E5A, 0x1E5B }, // LATIN CAPITAL LETTER R WITH DOT BELOW + { 0x1E5C, 0x1E5D }, // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON + { 0x1E5E, 0x1E5F }, // LATIN CAPITAL LETTER R WITH LINE BELOW + { 0x1E60, 0x1E61 }, // LATIN CAPITAL LETTER S WITH DOT ABOVE + { 0x1E62, 0x1E63 }, // LATIN CAPITAL LETTER S WITH DOT BELOW + { 0x1E64, 0x1E65 }, // LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE + { 0x1E66, 0x1E67 }, // LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE + { 0x1E68, 0x1E69 }, // LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE + { 0x1E6A, 0x1E6B }, // LATIN CAPITAL LETTER T WITH DOT ABOVE + { 0x1E6C, 0x1E6D }, // LATIN CAPITAL LETTER T WITH DOT BELOW + { 0x1E6E, 0x1E6F }, // LATIN CAPITAL LETTER T WITH LINE BELOW + { 0x1E70, 0x1E71 }, // LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW + { 0x1E72, 0x1E73 }, // LATIN CAPITAL LETTER U WITH DIAERESIS BELOW + { 0x1E74, 0x1E75 }, // LATIN CAPITAL LETTER U WITH TILDE BELOW + { 0x1E76, 0x1E77 }, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW + { 0x1E78, 0x1E79 }, // LATIN CAPITAL LETTER U 
WITH TILDE AND ACUTE + { 0x1E7A, 0x1E7B }, // LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS + { 0x1E7C, 0x1E7D }, // LATIN CAPITAL LETTER V WITH TILDE + { 0x1E7E, 0x1E7F }, // LATIN CAPITAL LETTER V WITH DOT BELOW + { 0x1E80, 0x1E81 }, // LATIN CAPITAL LETTER W WITH GRAVE + { 0x1E82, 0x1E83 }, // LATIN CAPITAL LETTER W WITH ACUTE + { 0x1E84, 0x1E85 }, // LATIN CAPITAL LETTER W WITH DIAERESIS + { 0x1E86, 0x1E87 }, // LATIN CAPITAL LETTER W WITH DOT ABOVE + { 0x1E88, 0x1E89 }, // LATIN CAPITAL LETTER W WITH DOT BELOW + { 0x1E8A, 0x1E8B }, // LATIN CAPITAL LETTER X WITH DOT ABOVE + { 0x1E8C, 0x1E8D }, // LATIN CAPITAL LETTER X WITH DIAERESIS + { 0x1E8E, 0x1E8F }, // LATIN CAPITAL LETTER Y WITH DOT ABOVE + { 0x1E90, 0x1E91 }, // LATIN CAPITAL LETTER Z WITH CIRCUMFLEX + { 0x1E92, 0x1E93 }, // LATIN CAPITAL LETTER Z WITH DOT BELOW + { 0x1E94, 0x1E95 }, // LATIN CAPITAL LETTER Z WITH LINE BELOW + { 0x1E9E, 0x00DF }, // LATIN CAPITAL LETTER SHARP S + { 0x1EA0, 0x1EA1 }, // LATIN CAPITAL LETTER A WITH DOT BELOW + { 0x1EA2, 0x1EA3 }, // LATIN CAPITAL LETTER A WITH HOOK ABOVE + { 0x1EA4, 0x1EA5 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE + { 0x1EA6, 0x1EA7 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE + { 0x1EA8, 0x1EA9 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE + { 0x1EAA, 0x1EAB }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE + { 0x1EAC, 0x1EAD }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW + { 0x1EAE, 0x1EAF }, // LATIN CAPITAL LETTER A WITH BREVE AND ACUTE + { 0x1EB0, 0x1EB1 }, // LATIN CAPITAL LETTER A WITH BREVE AND GRAVE + { 0x1EB2, 0x1EB3 }, // LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE + { 0x1EB4, 0x1EB5 }, // LATIN CAPITAL LETTER A WITH BREVE AND TILDE + { 0x1EB6, 0x1EB7 }, // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW + { 0x1EB8, 0x1EB9 }, // LATIN CAPITAL LETTER E WITH DOT BELOW + { 0x1EBA, 0x1EBB }, // LATIN CAPITAL LETTER E WITH HOOK ABOVE + { 0x1EBC, 0x1EBD }, // LATIN CAPITAL LETTER E WITH 
TILDE + { 0x1EBE, 0x1EBF }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE + { 0x1EC0, 0x1EC1 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE + { 0x1EC2, 0x1EC3 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE + { 0x1EC4, 0x1EC5 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE + { 0x1EC6, 0x1EC7 }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW + { 0x1EC8, 0x1EC9 }, // LATIN CAPITAL LETTER I WITH HOOK ABOVE + { 0x1ECA, 0x1ECB }, // LATIN CAPITAL LETTER I WITH DOT BELOW + { 0x1ECC, 0x1ECD }, // LATIN CAPITAL LETTER O WITH DOT BELOW + { 0x1ECE, 0x1ECF }, // LATIN CAPITAL LETTER O WITH HOOK ABOVE + { 0x1ED0, 0x1ED1 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE + { 0x1ED2, 0x1ED3 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE + { 0x1ED4, 0x1ED5 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE + { 0x1ED6, 0x1ED7 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE + { 0x1ED8, 0x1ED9 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW + { 0x1EDA, 0x1EDB }, // LATIN CAPITAL LETTER O WITH HORN AND ACUTE + { 0x1EDC, 0x1EDD }, // LATIN CAPITAL LETTER O WITH HORN AND GRAVE + { 0x1EDE, 0x1EDF }, // LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE + { 0x1EE0, 0x1EE1 }, // LATIN CAPITAL LETTER O WITH HORN AND TILDE + { 0x1EE2, 0x1EE3 }, // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW + { 0x1EE4, 0x1EE5 }, // LATIN CAPITAL LETTER U WITH DOT BELOW + { 0x1EE6, 0x1EE7 }, // LATIN CAPITAL LETTER U WITH HOOK ABOVE + { 0x1EE8, 0x1EE9 }, // LATIN CAPITAL LETTER U WITH HORN AND ACUTE + { 0x1EEA, 0x1EEB }, // LATIN CAPITAL LETTER U WITH HORN AND GRAVE + { 0x1EEC, 0x1EED }, // LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE + { 0x1EEE, 0x1EEF }, // LATIN CAPITAL LETTER U WITH HORN AND TILDE + { 0x1EF0, 0x1EF1 }, // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW + { 0x1EF2, 0x1EF3 }, // LATIN CAPITAL LETTER Y WITH GRAVE + { 0x1EF4, 0x1EF5 }, // LATIN CAPITAL LETTER Y WITH DOT BELOW + { 0x1EF6, 0x1EF7 }, // LATIN 
CAPITAL LETTER Y WITH HOOK ABOVE + { 0x1EF8, 0x1EF9 }, // LATIN CAPITAL LETTER Y WITH TILDE + { 0x1EFA, 0x1EFB }, // LATIN CAPITAL LETTER MIDDLE-WELSH LL + { 0x1EFC, 0x1EFD }, // LATIN CAPITAL LETTER MIDDLE-WELSH V + { 0x1EFE, 0x1EFF }, // LATIN CAPITAL LETTER Y WITH LOOP + { 0x1F08, 0x1F00 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI + { 0x1F09, 0x1F01 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA + { 0x1F0A, 0x1F02 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA + { 0x1F0B, 0x1F03 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA + { 0x1F0C, 0x1F04 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA + { 0x1F0D, 0x1F05 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA + { 0x1F0E, 0x1F06 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI + { 0x1F0F, 0x1F07 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI + { 0x1F18, 0x1F10 }, // GREEK CAPITAL LETTER EPSILON WITH PSILI + { 0x1F19, 0x1F11 }, // GREEK CAPITAL LETTER EPSILON WITH DASIA + { 0x1F1A, 0x1F12 }, // GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA + { 0x1F1B, 0x1F13 }, // GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA + { 0x1F1C, 0x1F14 }, // GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA + { 0x1F1D, 0x1F15 }, // GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA + { 0x1F28, 0x1F20 }, // GREEK CAPITAL LETTER ETA WITH PSILI + { 0x1F29, 0x1F21 }, // GREEK CAPITAL LETTER ETA WITH DASIA + { 0x1F2A, 0x1F22 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA + { 0x1F2B, 0x1F23 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA + { 0x1F2C, 0x1F24 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA + { 0x1F2D, 0x1F25 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA + { 0x1F2E, 0x1F26 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI + { 0x1F2F, 0x1F27 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI + { 0x1F38, 0x1F30 }, // GREEK CAPITAL LETTER IOTA WITH PSILI + { 0x1F39, 0x1F31 }, // GREEK CAPITAL LETTER IOTA WITH DASIA + { 0x1F3A, 0x1F32 }, // GREEK CAPITAL 
LETTER IOTA WITH PSILI AND VARIA + { 0x1F3B, 0x1F33 }, // GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA + { 0x1F3C, 0x1F34 }, // GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA + { 0x1F3D, 0x1F35 }, // GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA + { 0x1F3E, 0x1F36 }, // GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI + { 0x1F3F, 0x1F37 }, // GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI + { 0x1F48, 0x1F40 }, // GREEK CAPITAL LETTER OMICRON WITH PSILI + { 0x1F49, 0x1F41 }, // GREEK CAPITAL LETTER OMICRON WITH DASIA + { 0x1F4A, 0x1F42 }, // GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA + { 0x1F4B, 0x1F43 }, // GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA + { 0x1F4C, 0x1F44 }, // GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA + { 0x1F4D, 0x1F45 }, // GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA + { 0x1F59, 0x1F51 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA + { 0x1F5B, 0x1F53 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA + { 0x1F5D, 0x1F55 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA + { 0x1F5F, 0x1F57 }, // GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI + { 0x1F68, 0x1F60 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI + { 0x1F69, 0x1F61 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA + { 0x1F6A, 0x1F62 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA + { 0x1F6B, 0x1F63 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA + { 0x1F6C, 0x1F64 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA + { 0x1F6D, 0x1F65 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA + { 0x1F6E, 0x1F66 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI + { 0x1F6F, 0x1F67 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI + { 0x1F88, 0x1F80 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + { 0x1F89, 0x1F81 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + { 0x1F8A, 0x1F82 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1F8B, 0x1F83 }, // GREEK CAPITAL 
LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1F8C, 0x1F84 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1F8D, 0x1F85 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1F8E, 0x1F86 }, // GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1F8F, 0x1F87 }, // GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1F98, 0x1F90 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + { 0x1F99, 0x1F91 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + { 0x1F9A, 0x1F92 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1F9B, 0x1F93 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1F9C, 0x1F94 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1F9D, 0x1F95 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1F9E, 0x1F96 }, // GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1F9F, 0x1F97 }, // GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1FA8, 0x1FA0 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + { 0x1FA9, 0x1FA1 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + { 0x1FAA, 0x1FA2 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + { 0x1FAB, 0x1FA3 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI + { 0x1FAC, 0x1FA4 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + { 0x1FAD, 0x1FA5 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + { 0x1FAE, 0x1FA6 }, // GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1FAF, 0x1FA7 }, // GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + { 0x1FB8, 0x1FB0 }, // GREEK CAPITAL LETTER ALPHA WITH VRACHY + { 0x1FB9, 0x1FB1 }, // GREEK CAPITAL LETTER ALPHA WITH MACRON + { 
0x1FBA, 0x1F70 }, // GREEK CAPITAL LETTER ALPHA WITH VARIA + { 0x1FBB, 0x1F71 }, // GREEK CAPITAL LETTER ALPHA WITH OXIA + { 0x1FBC, 0x1FB3 }, // GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + { 0x1FC8, 0x1F72 }, // GREEK CAPITAL LETTER EPSILON WITH VARIA + { 0x1FC9, 0x1F73 }, // GREEK CAPITAL LETTER EPSILON WITH OXIA + { 0x1FCA, 0x1F74 }, // GREEK CAPITAL LETTER ETA WITH VARIA + { 0x1FCB, 0x1F75 }, // GREEK CAPITAL LETTER ETA WITH OXIA + { 0x1FCC, 0x1FC3 }, // GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + { 0x1FD8, 0x1FD0 }, // GREEK CAPITAL LETTER IOTA WITH VRACHY + { 0x1FD9, 0x1FD1 }, // GREEK CAPITAL LETTER IOTA WITH MACRON + { 0x1FDA, 0x1F76 }, // GREEK CAPITAL LETTER IOTA WITH VARIA + { 0x1FDB, 0x1F77 }, // GREEK CAPITAL LETTER IOTA WITH OXIA + { 0x1FE8, 0x1FE0 }, // GREEK CAPITAL LETTER UPSILON WITH VRACHY + { 0x1FE9, 0x1FE1 }, // GREEK CAPITAL LETTER UPSILON WITH MACRON + { 0x1FEA, 0x1F7A }, // GREEK CAPITAL LETTER UPSILON WITH VARIA + { 0x1FEB, 0x1F7B }, // GREEK CAPITAL LETTER UPSILON WITH OXIA + { 0x1FEC, 0x1FE5 }, // GREEK CAPITAL LETTER RHO WITH DASIA + { 0x1FF8, 0x1F78 }, // GREEK CAPITAL LETTER OMICRON WITH VARIA + { 0x1FF9, 0x1F79 }, // GREEK CAPITAL LETTER OMICRON WITH OXIA + { 0x1FFA, 0x1F7C }, // GREEK CAPITAL LETTER OMEGA WITH VARIA + { 0x1FFB, 0x1F7D }, // GREEK CAPITAL LETTER OMEGA WITH OXIA + { 0x1FFC, 0x1FF3 }, // GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + { 0x2126, 0x03C9 }, // OHM SIGN + { 0x212A, 0x006B }, // KELVIN SIGN + { 0x212B, 0x00E5 }, // ANGSTROM SIGN + { 0x2132, 0x214E }, // TURNED CAPITAL F + { 0x2160, 0x2170 }, // ROMAN NUMERAL ONE + { 0x2161, 0x2171 }, // ROMAN NUMERAL TWO + { 0x2162, 0x2172 }, // ROMAN NUMERAL THREE + { 0x2163, 0x2173 }, // ROMAN NUMERAL FOUR + { 0x2164, 0x2174 }, // ROMAN NUMERAL FIVE + { 0x2165, 0x2175 }, // ROMAN NUMERAL SIX + { 0x2166, 0x2176 }, // ROMAN NUMERAL SEVEN + { 0x2167, 0x2177 }, // ROMAN NUMERAL EIGHT + { 0x2168, 0x2178 }, // ROMAN NUMERAL NINE + { 0x2169, 0x2179 }, // ROMAN NUMERAL 
TEN + { 0x216A, 0x217A }, // ROMAN NUMERAL ELEVEN + { 0x216B, 0x217B }, // ROMAN NUMERAL TWELVE + { 0x216C, 0x217C }, // ROMAN NUMERAL FIFTY + { 0x216D, 0x217D }, // ROMAN NUMERAL ONE HUNDRED + { 0x216E, 0x217E }, // ROMAN NUMERAL FIVE HUNDRED + { 0x216F, 0x217F }, // ROMAN NUMERAL ONE THOUSAND + { 0x2183, 0x2184 }, // ROMAN NUMERAL REVERSED ONE HUNDRED + { 0x24B6, 0x24D0 }, // CIRCLED LATIN CAPITAL LETTER A + { 0x24B7, 0x24D1 }, // CIRCLED LATIN CAPITAL LETTER B + { 0x24B8, 0x24D2 }, // CIRCLED LATIN CAPITAL LETTER C + { 0x24B9, 0x24D3 }, // CIRCLED LATIN CAPITAL LETTER D + { 0x24BA, 0x24D4 }, // CIRCLED LATIN CAPITAL LETTER E + { 0x24BB, 0x24D5 }, // CIRCLED LATIN CAPITAL LETTER F + { 0x24BC, 0x24D6 }, // CIRCLED LATIN CAPITAL LETTER G + { 0x24BD, 0x24D7 }, // CIRCLED LATIN CAPITAL LETTER H + { 0x24BE, 0x24D8 }, // CIRCLED LATIN CAPITAL LETTER I + { 0x24BF, 0x24D9 }, // CIRCLED LATIN CAPITAL LETTER J + { 0x24C0, 0x24DA }, // CIRCLED LATIN CAPITAL LETTER K + { 0x24C1, 0x24DB }, // CIRCLED LATIN CAPITAL LETTER L + { 0x24C2, 0x24DC }, // CIRCLED LATIN CAPITAL LETTER M + { 0x24C3, 0x24DD }, // CIRCLED LATIN CAPITAL LETTER N + { 0x24C4, 0x24DE }, // CIRCLED LATIN CAPITAL LETTER O + { 0x24C5, 0x24DF }, // CIRCLED LATIN CAPITAL LETTER P + { 0x24C6, 0x24E0 }, // CIRCLED LATIN CAPITAL LETTER Q + { 0x24C7, 0x24E1 }, // CIRCLED LATIN CAPITAL LETTER R + { 0x24C8, 0x24E2 }, // CIRCLED LATIN CAPITAL LETTER S + { 0x24C9, 0x24E3 }, // CIRCLED LATIN CAPITAL LETTER T + { 0x24CA, 0x24E4 }, // CIRCLED LATIN CAPITAL LETTER U + { 0x24CB, 0x24E5 }, // CIRCLED LATIN CAPITAL LETTER V + { 0x24CC, 0x24E6 }, // CIRCLED LATIN CAPITAL LETTER W + { 0x24CD, 0x24E7 }, // CIRCLED LATIN CAPITAL LETTER X + { 0x24CE, 0x24E8 }, // CIRCLED LATIN CAPITAL LETTER Y + { 0x24CF, 0x24E9 }, // CIRCLED LATIN CAPITAL LETTER Z + { 0x2C00, 0x2C30 }, // GLAGOLITIC CAPITAL LETTER AZU + { 0x2C01, 0x2C31 }, // GLAGOLITIC CAPITAL LETTER BUKY + { 0x2C02, 0x2C32 }, // GLAGOLITIC CAPITAL LETTER VEDE + { 0x2C03, 0x2C33 
}, // GLAGOLITIC CAPITAL LETTER GLAGOLI + { 0x2C04, 0x2C34 }, // GLAGOLITIC CAPITAL LETTER DOBRO + { 0x2C05, 0x2C35 }, // GLAGOLITIC CAPITAL LETTER YESTU + { 0x2C06, 0x2C36 }, // GLAGOLITIC CAPITAL LETTER ZHIVETE + { 0x2C07, 0x2C37 }, // GLAGOLITIC CAPITAL LETTER DZELO + { 0x2C08, 0x2C38 }, // GLAGOLITIC CAPITAL LETTER ZEMLJA + { 0x2C09, 0x2C39 }, // GLAGOLITIC CAPITAL LETTER IZHE + { 0x2C0A, 0x2C3A }, // GLAGOLITIC CAPITAL LETTER INITIAL IZHE + { 0x2C0B, 0x2C3B }, // GLAGOLITIC CAPITAL LETTER I + { 0x2C0C, 0x2C3C }, // GLAGOLITIC CAPITAL LETTER DJERVI + { 0x2C0D, 0x2C3D }, // GLAGOLITIC CAPITAL LETTER KAKO + { 0x2C0E, 0x2C3E }, // GLAGOLITIC CAPITAL LETTER LJUDIJE + { 0x2C0F, 0x2C3F }, // GLAGOLITIC CAPITAL LETTER MYSLITE + { 0x2C10, 0x2C40 }, // GLAGOLITIC CAPITAL LETTER NASHI + { 0x2C11, 0x2C41 }, // GLAGOLITIC CAPITAL LETTER ONU + { 0x2C12, 0x2C42 }, // GLAGOLITIC CAPITAL LETTER POKOJI + { 0x2C13, 0x2C43 }, // GLAGOLITIC CAPITAL LETTER RITSI + { 0x2C14, 0x2C44 }, // GLAGOLITIC CAPITAL LETTER SLOVO + { 0x2C15, 0x2C45 }, // GLAGOLITIC CAPITAL LETTER TVRIDO + { 0x2C16, 0x2C46 }, // GLAGOLITIC CAPITAL LETTER UKU + { 0x2C17, 0x2C47 }, // GLAGOLITIC CAPITAL LETTER FRITU + { 0x2C18, 0x2C48 }, // GLAGOLITIC CAPITAL LETTER HERU + { 0x2C19, 0x2C49 }, // GLAGOLITIC CAPITAL LETTER OTU + { 0x2C1A, 0x2C4A }, // GLAGOLITIC CAPITAL LETTER PE + { 0x2C1B, 0x2C4B }, // GLAGOLITIC CAPITAL LETTER SHTA + { 0x2C1C, 0x2C4C }, // GLAGOLITIC CAPITAL LETTER TSI + { 0x2C1D, 0x2C4D }, // GLAGOLITIC CAPITAL LETTER CHRIVI + { 0x2C1E, 0x2C4E }, // GLAGOLITIC CAPITAL LETTER SHA + { 0x2C1F, 0x2C4F }, // GLAGOLITIC CAPITAL LETTER YERU + { 0x2C20, 0x2C50 }, // GLAGOLITIC CAPITAL LETTER YERI + { 0x2C21, 0x2C51 }, // GLAGOLITIC CAPITAL LETTER YATI + { 0x2C22, 0x2C52 }, // GLAGOLITIC CAPITAL LETTER SPIDERY HA + { 0x2C23, 0x2C53 }, // GLAGOLITIC CAPITAL LETTER YU + { 0x2C24, 0x2C54 }, // GLAGOLITIC CAPITAL LETTER SMALL YUS + { 0x2C25, 0x2C55 }, // GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL + { 
0x2C26, 0x2C56 }, // GLAGOLITIC CAPITAL LETTER YO + { 0x2C27, 0x2C57 }, // GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS + { 0x2C28, 0x2C58 }, // GLAGOLITIC CAPITAL LETTER BIG YUS + { 0x2C29, 0x2C59 }, // GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS + { 0x2C2A, 0x2C5A }, // GLAGOLITIC CAPITAL LETTER FITA + { 0x2C2B, 0x2C5B }, // GLAGOLITIC CAPITAL LETTER IZHITSA + { 0x2C2C, 0x2C5C }, // GLAGOLITIC CAPITAL LETTER SHTAPIC + { 0x2C2D, 0x2C5D }, // GLAGOLITIC CAPITAL LETTER TROKUTASTI A + { 0x2C2E, 0x2C5E }, // GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE + { 0x2C60, 0x2C61 }, // LATIN CAPITAL LETTER L WITH DOUBLE BAR + { 0x2C62, 0x026B }, // LATIN CAPITAL LETTER L WITH MIDDLE TILDE + { 0x2C63, 0x1D7D }, // LATIN CAPITAL LETTER P WITH STROKE + { 0x2C64, 0x027D }, // LATIN CAPITAL LETTER R WITH TAIL + { 0x2C67, 0x2C68 }, // LATIN CAPITAL LETTER H WITH DESCENDER + { 0x2C69, 0x2C6A }, // LATIN CAPITAL LETTER K WITH DESCENDER + { 0x2C6B, 0x2C6C }, // LATIN CAPITAL LETTER Z WITH DESCENDER + { 0x2C6D, 0x0251 }, // LATIN CAPITAL LETTER ALPHA + { 0x2C6E, 0x0271 }, // LATIN CAPITAL LETTER M WITH HOOK + { 0x2C6F, 0x0250 }, // LATIN CAPITAL LETTER TURNED A + { 0x2C70, 0x0252 }, // LATIN CAPITAL LETTER TURNED ALPHA + { 0x2C72, 0x2C73 }, // LATIN CAPITAL LETTER W WITH HOOK + { 0x2C75, 0x2C76 }, // LATIN CAPITAL LETTER HALF H + { 0x2C7E, 0x023F }, // LATIN CAPITAL LETTER S WITH SWASH TAIL + { 0x2C7F, 0x0240 }, // LATIN CAPITAL LETTER Z WITH SWASH TAIL + { 0x2C80, 0x2C81 }, // COPTIC CAPITAL LETTER ALFA + { 0x2C82, 0x2C83 }, // COPTIC CAPITAL LETTER VIDA + { 0x2C84, 0x2C85 }, // COPTIC CAPITAL LETTER GAMMA + { 0x2C86, 0x2C87 }, // COPTIC CAPITAL LETTER DALDA + { 0x2C88, 0x2C89 }, // COPTIC CAPITAL LETTER EIE + { 0x2C8A, 0x2C8B }, // COPTIC CAPITAL LETTER SOU + { 0x2C8C, 0x2C8D }, // COPTIC CAPITAL LETTER ZATA + { 0x2C8E, 0x2C8F }, // COPTIC CAPITAL LETTER HATE + { 0x2C90, 0x2C91 }, // COPTIC CAPITAL LETTER THETHE + { 0x2C92, 0x2C93 }, // COPTIC CAPITAL LETTER IAUDA + { 0x2C94, 0x2C95 }, // 
COPTIC CAPITAL LETTER KAPA + { 0x2C96, 0x2C97 }, // COPTIC CAPITAL LETTER LAULA + { 0x2C98, 0x2C99 }, // COPTIC CAPITAL LETTER MI + { 0x2C9A, 0x2C9B }, // COPTIC CAPITAL LETTER NI + { 0x2C9C, 0x2C9D }, // COPTIC CAPITAL LETTER KSI + { 0x2C9E, 0x2C9F }, // COPTIC CAPITAL LETTER O + { 0x2CA0, 0x2CA1 }, // COPTIC CAPITAL LETTER PI + { 0x2CA2, 0x2CA3 }, // COPTIC CAPITAL LETTER RO + { 0x2CA4, 0x2CA5 }, // COPTIC CAPITAL LETTER SIMA + { 0x2CA6, 0x2CA7 }, // COPTIC CAPITAL LETTER TAU + { 0x2CA8, 0x2CA9 }, // COPTIC CAPITAL LETTER UA + { 0x2CAA, 0x2CAB }, // COPTIC CAPITAL LETTER FI + { 0x2CAC, 0x2CAD }, // COPTIC CAPITAL LETTER KHI + { 0x2CAE, 0x2CAF }, // COPTIC CAPITAL LETTER PSI + { 0x2CB0, 0x2CB1 }, // COPTIC CAPITAL LETTER OOU + { 0x2CB2, 0x2CB3 }, // COPTIC CAPITAL LETTER DIALECT-P ALEF + { 0x2CB4, 0x2CB5 }, // COPTIC CAPITAL LETTER OLD COPTIC AIN + { 0x2CB6, 0x2CB7 }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE + { 0x2CB8, 0x2CB9 }, // COPTIC CAPITAL LETTER DIALECT-P KAPA + { 0x2CBA, 0x2CBB }, // COPTIC CAPITAL LETTER DIALECT-P NI + { 0x2CBC, 0x2CBD }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI + { 0x2CBE, 0x2CBF }, // COPTIC CAPITAL LETTER OLD COPTIC OOU + { 0x2CC0, 0x2CC1 }, // COPTIC CAPITAL LETTER SAMPI + { 0x2CC2, 0x2CC3 }, // COPTIC CAPITAL LETTER CROSSED SHEI + { 0x2CC4, 0x2CC5 }, // COPTIC CAPITAL LETTER OLD COPTIC SHEI + { 0x2CC6, 0x2CC7 }, // COPTIC CAPITAL LETTER OLD COPTIC ESH + { 0x2CC8, 0x2CC9 }, // COPTIC CAPITAL LETTER AKHMIMIC KHEI + { 0x2CCA, 0x2CCB }, // COPTIC CAPITAL LETTER DIALECT-P HORI + { 0x2CCC, 0x2CCD }, // COPTIC CAPITAL LETTER OLD COPTIC HORI + { 0x2CCE, 0x2CCF }, // COPTIC CAPITAL LETTER OLD COPTIC HA + { 0x2CD0, 0x2CD1 }, // COPTIC CAPITAL LETTER L-SHAPED HA + { 0x2CD2, 0x2CD3 }, // COPTIC CAPITAL LETTER OLD COPTIC HEI + { 0x2CD4, 0x2CD5 }, // COPTIC CAPITAL LETTER OLD COPTIC HAT + { 0x2CD6, 0x2CD7 }, // COPTIC CAPITAL LETTER OLD COPTIC GANGIA + { 0x2CD8, 0x2CD9 }, // COPTIC CAPITAL LETTER OLD COPTIC DJA + { 0x2CDA, 0x2CDB }, // COPTIC 
CAPITAL LETTER OLD COPTIC SHIMA + { 0x2CDC, 0x2CDD }, // COPTIC CAPITAL LETTER OLD NUBIAN SHIMA + { 0x2CDE, 0x2CDF }, // COPTIC CAPITAL LETTER OLD NUBIAN NGI + { 0x2CE0, 0x2CE1 }, // COPTIC CAPITAL LETTER OLD NUBIAN NYI + { 0x2CE2, 0x2CE3 }, // COPTIC CAPITAL LETTER OLD NUBIAN WAU + { 0x2CEB, 0x2CEC }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI + { 0x2CED, 0x2CEE }, // COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA + { 0xA640, 0xA641 }, // CYRILLIC CAPITAL LETTER ZEMLYA + { 0xA642, 0xA643 }, // CYRILLIC CAPITAL LETTER DZELO + { 0xA644, 0xA645 }, // CYRILLIC CAPITAL LETTER REVERSED DZE + { 0xA646, 0xA647 }, // CYRILLIC CAPITAL LETTER IOTA + { 0xA648, 0xA649 }, // CYRILLIC CAPITAL LETTER DJERV + { 0xA64A, 0xA64B }, // CYRILLIC CAPITAL LETTER MONOGRAPH UK + { 0xA64C, 0xA64D }, // CYRILLIC CAPITAL LETTER BROAD OMEGA + { 0xA64E, 0xA64F }, // CYRILLIC CAPITAL LETTER NEUTRAL YER + { 0xA650, 0xA651 }, // CYRILLIC CAPITAL LETTER YERU WITH BACK YER + { 0xA652, 0xA653 }, // CYRILLIC CAPITAL LETTER IOTIFIED YAT + { 0xA654, 0xA655 }, // CYRILLIC CAPITAL LETTER REVERSED YU + { 0xA656, 0xA657 }, // CYRILLIC CAPITAL LETTER IOTIFIED A + { 0xA658, 0xA659 }, // CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS + { 0xA65A, 0xA65B }, // CYRILLIC CAPITAL LETTER BLENDED YUS + { 0xA65C, 0xA65D }, // CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS + { 0xA65E, 0xA65F }, // CYRILLIC CAPITAL LETTER YN + { 0xA660, 0xA661 }, // CYRILLIC CAPITAL LETTER REVERSED TSE + { 0xA662, 0xA663 }, // CYRILLIC CAPITAL LETTER SOFT DE + { 0xA664, 0xA665 }, // CYRILLIC CAPITAL LETTER SOFT EL + { 0xA666, 0xA667 }, // CYRILLIC CAPITAL LETTER SOFT EM + { 0xA668, 0xA669 }, // CYRILLIC CAPITAL LETTER MONOCULAR O + { 0xA66A, 0xA66B }, // CYRILLIC CAPITAL LETTER BINOCULAR O + { 0xA66C, 0xA66D }, // CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O + { 0xA680, 0xA681 }, // CYRILLIC CAPITAL LETTER DWE + { 0xA682, 0xA683 }, // CYRILLIC CAPITAL LETTER DZWE + { 0xA684, 0xA685 }, // CYRILLIC CAPITAL LETTER ZHWE + { 0xA686, 0xA687 }, // 
CYRILLIC CAPITAL LETTER CCHE + { 0xA688, 0xA689 }, // CYRILLIC CAPITAL LETTER DZZE + { 0xA68A, 0xA68B }, // CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK + { 0xA68C, 0xA68D }, // CYRILLIC CAPITAL LETTER TWE + { 0xA68E, 0xA68F }, // CYRILLIC CAPITAL LETTER TSWE + { 0xA690, 0xA691 }, // CYRILLIC CAPITAL LETTER TSSE + { 0xA692, 0xA693 }, // CYRILLIC CAPITAL LETTER TCHE + { 0xA694, 0xA695 }, // CYRILLIC CAPITAL LETTER HWE + { 0xA696, 0xA697 }, // CYRILLIC CAPITAL LETTER SHWE + { 0xA722, 0xA723 }, // LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF + { 0xA724, 0xA725 }, // LATIN CAPITAL LETTER EGYPTOLOGICAL AIN + { 0xA726, 0xA727 }, // LATIN CAPITAL LETTER HENG + { 0xA728, 0xA729 }, // LATIN CAPITAL LETTER TZ + { 0xA72A, 0xA72B }, // LATIN CAPITAL LETTER TRESILLO + { 0xA72C, 0xA72D }, // LATIN CAPITAL LETTER CUATRILLO + { 0xA72E, 0xA72F }, // LATIN CAPITAL LETTER CUATRILLO WITH COMMA + { 0xA732, 0xA733 }, // LATIN CAPITAL LETTER AA + { 0xA734, 0xA735 }, // LATIN CAPITAL LETTER AO + { 0xA736, 0xA737 }, // LATIN CAPITAL LETTER AU + { 0xA738, 0xA739 }, // LATIN CAPITAL LETTER AV + { 0xA73A, 0xA73B }, // LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR + { 0xA73C, 0xA73D }, // LATIN CAPITAL LETTER AY + { 0xA73E, 0xA73F }, // LATIN CAPITAL LETTER REVERSED C WITH DOT + { 0xA740, 0xA741 }, // LATIN CAPITAL LETTER K WITH STROKE + { 0xA742, 0xA743 }, // LATIN CAPITAL LETTER K WITH DIAGONAL STROKE + { 0xA744, 0xA745 }, // LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE + { 0xA746, 0xA747 }, // LATIN CAPITAL LETTER BROKEN L + { 0xA748, 0xA749 }, // LATIN CAPITAL LETTER L WITH HIGH STROKE + { 0xA74A, 0xA74B }, // LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY + { 0xA74C, 0xA74D }, // LATIN CAPITAL LETTER O WITH LOOP + { 0xA74E, 0xA74F }, // LATIN CAPITAL LETTER OO + { 0xA750, 0xA751 }, // LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER + { 0xA752, 0xA753 }, // LATIN CAPITAL LETTER P WITH FLOURISH + { 0xA754, 0xA755 }, // LATIN CAPITAL LETTER P WITH SQUIRREL TAIL + { 0xA756, 0xA757 }, 
// LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER + { 0xA758, 0xA759 }, // LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE + { 0xA75A, 0xA75B }, // LATIN CAPITAL LETTER R ROTUNDA + { 0xA75C, 0xA75D }, // LATIN CAPITAL LETTER RUM ROTUNDA + { 0xA75E, 0xA75F }, // LATIN CAPITAL LETTER V WITH DIAGONAL STROKE + { 0xA760, 0xA761 }, // LATIN CAPITAL LETTER VY + { 0xA762, 0xA763 }, // LATIN CAPITAL LETTER VISIGOTHIC Z + { 0xA764, 0xA765 }, // LATIN CAPITAL LETTER THORN WITH STROKE + { 0xA766, 0xA767 }, // LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER + { 0xA768, 0xA769 }, // LATIN CAPITAL LETTER VEND + { 0xA76A, 0xA76B }, // LATIN CAPITAL LETTER ET + { 0xA76C, 0xA76D }, // LATIN CAPITAL LETTER IS + { 0xA76E, 0xA76F }, // LATIN CAPITAL LETTER CON + { 0xA779, 0xA77A }, // LATIN CAPITAL LETTER INSULAR D + { 0xA77B, 0xA77C }, // LATIN CAPITAL LETTER INSULAR F + { 0xA77D, 0x1D79 }, // LATIN CAPITAL LETTER INSULAR G + { 0xA77E, 0xA77F }, // LATIN CAPITAL LETTER TURNED INSULAR G + { 0xA780, 0xA781 }, // LATIN CAPITAL LETTER TURNED L + { 0xA782, 0xA783 }, // LATIN CAPITAL LETTER INSULAR R + { 0xA784, 0xA785 }, // LATIN CAPITAL LETTER INSULAR S + { 0xA786, 0xA787 }, // LATIN CAPITAL LETTER INSULAR T + { 0xA78B, 0xA78C }, // LATIN CAPITAL LETTER SALTILLO + { 0xA78D, 0x0265 }, // LATIN CAPITAL LETTER TURNED H + { 0xA790, 0xA791 }, // LATIN CAPITAL LETTER N WITH DESCENDER + { 0xA7A0, 0xA7A1 }, // LATIN CAPITAL LETTER G WITH OBLIQUE STROKE + { 0xA7A2, 0xA7A3 }, // LATIN CAPITAL LETTER K WITH OBLIQUE STROKE + { 0xA7A4, 0xA7A5 }, // LATIN CAPITAL LETTER N WITH OBLIQUE STROKE + { 0xA7A6, 0xA7A7 }, // LATIN CAPITAL LETTER R WITH OBLIQUE STROKE + { 0xA7A8, 0xA7A9 }, // LATIN CAPITAL LETTER S WITH OBLIQUE STROKE + { 0xFF21, 0xFF41 }, // FULLWIDTH LATIN CAPITAL LETTER A + { 0xFF22, 0xFF42 }, // FULLWIDTH LATIN CAPITAL LETTER B + { 0xFF23, 0xFF43 }, // FULLWIDTH LATIN CAPITAL LETTER C + { 0xFF24, 0xFF44 }, // FULLWIDTH LATIN CAPITAL LETTER D + { 0xFF25, 0xFF45 }, // FULLWIDTH 
LATIN CAPITAL LETTER E + { 0xFF26, 0xFF46 }, // FULLWIDTH LATIN CAPITAL LETTER F + { 0xFF27, 0xFF47 }, // FULLWIDTH LATIN CAPITAL LETTER G + { 0xFF28, 0xFF48 }, // FULLWIDTH LATIN CAPITAL LETTER H + { 0xFF29, 0xFF49 }, // FULLWIDTH LATIN CAPITAL LETTER I + { 0xFF2A, 0xFF4A }, // FULLWIDTH LATIN CAPITAL LETTER J + { 0xFF2B, 0xFF4B }, // FULLWIDTH LATIN CAPITAL LETTER K + { 0xFF2C, 0xFF4C }, // FULLWIDTH LATIN CAPITAL LETTER L + { 0xFF2D, 0xFF4D }, // FULLWIDTH LATIN CAPITAL LETTER M + { 0xFF2E, 0xFF4E }, // FULLWIDTH LATIN CAPITAL LETTER N + { 0xFF2F, 0xFF4F }, // FULLWIDTH LATIN CAPITAL LETTER O + { 0xFF30, 0xFF50 }, // FULLWIDTH LATIN CAPITAL LETTER P + { 0xFF31, 0xFF51 }, // FULLWIDTH LATIN CAPITAL LETTER Q + { 0xFF32, 0xFF52 }, // FULLWIDTH LATIN CAPITAL LETTER R + { 0xFF33, 0xFF53 }, // FULLWIDTH LATIN CAPITAL LETTER S + { 0xFF34, 0xFF54 }, // FULLWIDTH LATIN CAPITAL LETTER T + { 0xFF35, 0xFF55 }, // FULLWIDTH LATIN CAPITAL LETTER U + { 0xFF36, 0xFF56 }, // FULLWIDTH LATIN CAPITAL LETTER V + { 0xFF37, 0xFF57 }, // FULLWIDTH LATIN CAPITAL LETTER W + { 0xFF38, 0xFF58 }, // FULLWIDTH LATIN CAPITAL LETTER X + { 0xFF39, 0xFF59 }, // FULLWIDTH LATIN CAPITAL LETTER Y + { 0xFF3A, 0xFF5A } // FULLWIDTH LATIN CAPITAL LETTER Z +}; + +static int compare_pair_capital(const void *a, const void *b) { + return static_cast(*static_cast(a)) + - static_cast((static_cast(b))->capital); +} + +/* static */ int CharUtils::latin_tolower(const int c) { + struct LatinCapitalSmallPair *p = + static_cast(bsearch(&c, SORTED_CHAR_MAP, + NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital)); + return p ? static_cast(p->small) : c; +} + +/* + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, CharUtils::BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. 
+ * + * Generated with: + * cat UnicodeData.txt | perl -e 'while (<>) { @foo = split(/;/); $foo[5] =~ s/<.*> //; \ + * $base[hex($foo[0])] = hex($foo[5]);} \ + * for ($i = 0; $i < 0x500; $i += 8) { printf("/" . "* U+%04X *" . "/ ", $i); \ + * for ($j = $i; $j < $i + 8; $j++) { \ + * printf("0x%04X, ", $base[$j] ? $base[$j] : $j)}; print "\n"; }' + */ +/* static */ const unsigned short CharUtils::BASE_CHARS[CharUtils::BASE_CHARS_SIZE] = { + /* U+0000 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + /* U+0008 */ 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, + /* U+0010 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + /* U+0018 */ 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, + /* U+0020 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + /* U+0028 */ 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, + /* U+0030 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + /* U+0038 */ 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, + /* U+0040 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + /* U+0048 */ 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, + /* U+0050 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + /* U+0058 */ 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, + /* U+0060 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, + /* U+0068 */ 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, + /* U+0070 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, + /* U+0078 */ 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, + /* U+0080 */ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + /* U+0088 */ 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + /* U+0090 */ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + /* U+0098 */ 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 
0x009D, 0x009E, 0x009F, + /* U+00A0 */ 0x0020, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, + /* U+00A8 */ 0x0020, 0x00A9, 0x0061, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0020, + /* U+00B0 */ 0x00B0, 0x00B1, 0x0032, 0x0033, 0x0020, 0x03BC, 0x00B6, 0x00B7, + /* U+00B8 */ 0x0020, 0x0031, 0x006F, 0x00BB, 0x0031, 0x0031, 0x0033, 0x00BF, + /* U+00C0 */ 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x0041, 0x00C6, 0x0043, + /* U+00C8 */ 0x0045, 0x0045, 0x0045, 0x0045, 0x0049, 0x0049, 0x0049, 0x0049, + /* U+00D0 */ 0x00D0, 0x004E, 0x004F, 0x004F, 0x004F, 0x004F, 0x004F, 0x00D7, + /* U+00D8 */ 0x004F, 0x0055, 0x0055, 0x0055, 0x0055, 0x0059, 0x00DE, 0x0073, + // U+00D8: Manually changed from 00D8 to 004F + // TODO: Check if it's really acceptable to consider Ø a diacritical variant of O + // U+00DF: Manually changed from 00DF to 0073 + /* U+00E0 */ 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x0061, 0x00E6, 0x0063, + /* U+00E8 */ 0x0065, 0x0065, 0x0065, 0x0065, 0x0069, 0x0069, 0x0069, 0x0069, + /* U+00F0 */ 0x00F0, 0x006E, 0x006F, 0x006F, 0x006F, 0x006F, 0x006F, 0x00F7, + /* U+00F8 */ 0x006F, 0x0075, 0x0075, 0x0075, 0x0075, 0x0079, 0x00FE, 0x0079, + // U+00F8: Manually changed from 00F8 to 006F + // TODO: Check if it's really acceptable to consider ø a diacritical variant of o + /* U+0100 */ 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0043, 0x0063, + /* U+0108 */ 0x0043, 0x0063, 0x0043, 0x0063, 0x0043, 0x0063, 0x0044, 0x0064, + /* U+0110 */ 0x0044, 0x0064, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, + // U+0110: Manually changed from 0110 to 0044 (base 'D', matching U+0111 -> 'd') + // U+0111: Manually changed from 0111 to 0064 + /* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067, + /* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127, + /* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, + // U+0131: Manually changed from 0131 to 0049 + /* U+0130 */ 0x0049, 0x0049, 0x0049, 0x0069, 0x004A, 0x006A, 0x004B, 0x006B, + /* U+0138 */
0x0138, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, 0x006C, 0x004C, + /* U+0140 */ 0x006C, 0x004C, 0x006C, 0x004E, 0x006E, 0x004E, 0x006E, 0x004E, + // U+0141: Manually changed from 0141 to 004C + // U+0142: Manually changed from 0142 to 006C + /* U+0148 */ 0x006E, 0x02BC, 0x014A, 0x014B, 0x004F, 0x006F, 0x004F, 0x006F, + /* U+0150 */ 0x004F, 0x006F, 0x0152, 0x0153, 0x0052, 0x0072, 0x0052, 0x0072, + /* U+0158 */ 0x0052, 0x0072, 0x0053, 0x0073, 0x0053, 0x0073, 0x0053, 0x0073, + /* U+0160 */ 0x0053, 0x0073, 0x0054, 0x0074, 0x0054, 0x0074, 0x0166, 0x0167, + /* U+0168 */ 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, + /* U+0170 */ 0x0055, 0x0075, 0x0055, 0x0075, 0x0057, 0x0077, 0x0059, 0x0079, + /* U+0178 */ 0x0059, 0x005A, 0x007A, 0x005A, 0x007A, 0x005A, 0x007A, 0x0073, + /* U+0180 */ 0x0180, 0x0181, 0x0182, 0x0183, 0x0184, 0x0185, 0x0186, 0x0187, + // TODO: A lot of letters are their own base code points, but for + // some (e.g. U+0180) it doesn't seem right. Ideally each code point should + // be checked individually with all languages it's used in. 
+ /* U+0188 */ 0x0188, 0x0189, 0x018A, 0x018B, 0x018C, 0x018D, 0x018E, 0x018F, + /* U+0190 */ 0x0190, 0x0191, 0x0192, 0x0193, 0x0194, 0x0195, 0x0196, 0x0197, + /* U+0198 */ 0x0198, 0x0199, 0x019A, 0x019B, 0x019C, 0x019D, 0x019E, 0x019F, + /* U+01A0 */ 0x004F, 0x006F, 0x01A2, 0x01A3, 0x01A4, 0x01A5, 0x01A6, 0x01A7, + /* U+01A8 */ 0x01A8, 0x01A9, 0x01AA, 0x01AB, 0x01AC, 0x01AD, 0x01AE, 0x0055, + /* U+01B0 */ 0x0075, 0x01B1, 0x01B2, 0x01B3, 0x01B4, 0x01B5, 0x01B6, 0x01B7, + /* U+01B8 */ 0x01B8, 0x01B9, 0x01BA, 0x01BB, 0x01BC, 0x01BD, 0x01BE, 0x01BF, + /* U+01C0 */ 0x01C0, 0x01C1, 0x01C2, 0x01C3, 0x0044, 0x0044, 0x0064, 0x004C, + /* U+01C8 */ 0x004C, 0x006C, 0x004E, 0x004E, 0x006E, 0x0041, 0x0061, 0x0049, + /* U+01D0 */ 0x0069, 0x004F, 0x006F, 0x0055, 0x0075, 0x0055, 0x0075, 0x0055, + // U+01D5: Manually changed from 00DC to 0055 + // U+01D6: Manually changed from 00FC to 0075 + // U+01D7: Manually changed from 00DC to 0055 + /* U+01D8 */ 0x0075, 0x0055, 0x0075, 0x0055, 0x0075, 0x01DD, 0x0041, 0x0061, + // U+01D8: Manually changed from 00FC to 0075 + // U+01D9: Manually changed from 00DC to 0055 + // U+01DA: Manually changed from 00FC to 0075 + // U+01DB: Manually changed from 00DC to 0055 + // U+01DC: Manually changed from 00FC to 0075 + // U+01DE: Manually changed from 00C4 to 0041 + // U+01DF: Manually changed from 00E4 to 0061 + /* U+01E0 */ 0x0041, 0x0061, 0x00C6, 0x00E6, 0x01E4, 0x01E5, 0x0047, 0x0067, + // U+01E0: Manually changed from 0226 to 0041 + // U+01E1: Manually changed from 0227 to 0061 + /* U+01E8 */ 0x004B, 0x006B, 0x004F, 0x006F, 0x004F, 0x006F, 0x01B7, 0x0292, + // U+01EC: Manually changed from 01EA to 004F + // U+01ED: Manually changed from 01EB to 006F + /* U+01F0 */ 0x006A, 0x0044, 0x0044, 0x0064, 0x0047, 0x0067, 0x01F6, 0x01F7, + /* U+01F8 */ 0x004E, 0x006E, 0x0041, 0x0061, 0x00C6, 0x00E6, 0x004F, 0x006F, + // U+01FA: Manually changed from 00C5 to 0041 + // U+01FB: Manually changed from 00E5 to 0061 + // U+01FE: Manually changed from 00D8 to 
004F + // TODO: Check if it's really acceptable to consider Ø a diacritical variant of O + // U+01FF: Manually changed from 00F8 to 006F + // TODO: Check if it's really acceptable to consider ø a diacritical variant of o + /* U+0200 */ 0x0041, 0x0061, 0x0041, 0x0061, 0x0045, 0x0065, 0x0045, 0x0065, + /* U+0208 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x004F, 0x006F, 0x004F, 0x006F, + /* U+0210 */ 0x0052, 0x0072, 0x0052, 0x0072, 0x0055, 0x0075, 0x0055, 0x0075, + /* U+0218 */ 0x0053, 0x0073, 0x0054, 0x0074, 0x021C, 0x021D, 0x0048, 0x0068, + /* U+0220 */ 0x0220, 0x0221, 0x0222, 0x0223, 0x0224, 0x0225, 0x0041, 0x0061, + /* U+0228 */ 0x0045, 0x0065, 0x004F, 0x006F, 0x004F, 0x006F, 0x004F, 0x006F, + // U+022A: Manually changed from 00D6 to 004F + // U+022B: Manually changed from 00F6 to 006F + // U+022C: Manually changed from 00D5 to 004F + // U+022D: Manually changed from 00F5 to 006F + /* U+0230 */ 0x004F, 0x006F, 0x0059, 0x0079, 0x0234, 0x0235, 0x0236, 0x0237, + // U+0230: Manually changed from 022E to 004F + // U+0231: Manually changed from 022F to 006F + /* U+0238 */ 0x0238, 0x0239, 0x023A, 0x023B, 0x023C, 0x023D, 0x023E, 0x023F, + /* U+0240 */ 0x0240, 0x0241, 0x0242, 0x0243, 0x0244, 0x0245, 0x0246, 0x0247, + /* U+0248 */ 0x0248, 0x0249, 0x024A, 0x024B, 0x024C, 0x024D, 0x024E, 0x024F, + /* U+0250 */ 0x0250, 0x0251, 0x0252, 0x0253, 0x0254, 0x0255, 0x0256, 0x0257, + /* U+0258 */ 0x0258, 0x0259, 0x025A, 0x025B, 0x025C, 0x025D, 0x025E, 0x025F, + /* U+0260 */ 0x0260, 0x0261, 0x0262, 0x0263, 0x0264, 0x0265, 0x0266, 0x0267, + /* U+0268 */ 0x0268, 0x0269, 0x026A, 0x026B, 0x026C, 0x026D, 0x026E, 0x026F, + /* U+0270 */ 0x0270, 0x0271, 0x0272, 0x0273, 0x0274, 0x0275, 0x0276, 0x0277, + /* U+0278 */ 0x0278, 0x0279, 0x027A, 0x027B, 0x027C, 0x027D, 0x027E, 0x027F, + /* U+0280 */ 0x0280, 0x0281, 0x0282, 0x0283, 0x0284, 0x0285, 0x0286, 0x0287, + /* U+0288 */ 0x0288, 0x0289, 0x028A, 0x028B, 0x028C, 0x028D, 0x028E, 0x028F, + /* U+0290 */ 0x0290, 0x0291, 0x0292, 0x0293, 0x0294, 0x0295, 
0x0296, 0x0297, + /* U+0298 */ 0x0298, 0x0299, 0x029A, 0x029B, 0x029C, 0x029D, 0x029E, 0x029F, + /* U+02A0 */ 0x02A0, 0x02A1, 0x02A2, 0x02A3, 0x02A4, 0x02A5, 0x02A6, 0x02A7, + /* U+02A8 */ 0x02A8, 0x02A9, 0x02AA, 0x02AB, 0x02AC, 0x02AD, 0x02AE, 0x02AF, + /* U+02B0 */ 0x0068, 0x0266, 0x006A, 0x0072, 0x0279, 0x027B, 0x0281, 0x0077, + /* U+02B8 */ 0x0079, 0x02B9, 0x02BA, 0x02BB, 0x02BC, 0x02BD, 0x02BE, 0x02BF, + /* U+02C0 */ 0x02C0, 0x02C1, 0x02C2, 0x02C3, 0x02C4, 0x02C5, 0x02C6, 0x02C7, + /* U+02C8 */ 0x02C8, 0x02C9, 0x02CA, 0x02CB, 0x02CC, 0x02CD, 0x02CE, 0x02CF, + /* U+02D0 */ 0x02D0, 0x02D1, 0x02D2, 0x02D3, 0x02D4, 0x02D5, 0x02D6, 0x02D7, + /* U+02D8 */ 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x02DE, 0x02DF, + /* U+02E0 */ 0x0263, 0x006C, 0x0073, 0x0078, 0x0295, 0x02E5, 0x02E6, 0x02E7, + /* U+02E8 */ 0x02E8, 0x02E9, 0x02EA, 0x02EB, 0x02EC, 0x02ED, 0x02EE, 0x02EF, + /* U+02F0 */ 0x02F0, 0x02F1, 0x02F2, 0x02F3, 0x02F4, 0x02F5, 0x02F6, 0x02F7, + /* U+02F8 */ 0x02F8, 0x02F9, 0x02FA, 0x02FB, 0x02FC, 0x02FD, 0x02FE, 0x02FF, + /* U+0300 */ 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + /* U+0308 */ 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, 0x030D, 0x030E, 0x030F, + /* U+0310 */ 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, + /* U+0318 */ 0x0318, 0x0319, 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, + /* U+0320 */ 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + /* U+0328 */ 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, + /* U+0330 */ 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + /* U+0338 */ 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, + /* U+0340 */ 0x0300, 0x0301, 0x0342, 0x0313, 0x0308, 0x0345, 0x0346, 0x0347, + /* U+0348 */ 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, 0x034E, 0x034F, + /* U+0350 */ 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + /* U+0358 */ 0x0358, 0x0359, 0x035A, 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, + /* 
U+0360 */ 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + /* U+0368 */ 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, + /* U+0370 */ 0x0370, 0x0371, 0x0372, 0x0373, 0x02B9, 0x0375, 0x0376, 0x0377, + /* U+0378 */ 0x0378, 0x0379, 0x0020, 0x037B, 0x037C, 0x037D, 0x003B, 0x037F, + /* U+0380 */ 0x0380, 0x0381, 0x0382, 0x0383, 0x0020, 0x00A8, 0x0391, 0x00B7, + /* U+0388 */ 0x0395, 0x0397, 0x0399, 0x038B, 0x039F, 0x038D, 0x03A5, 0x03A9, + /* U+0390 */ 0x03CA, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + /* U+0398 */ 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F, + /* U+03A0 */ 0x03A0, 0x03A1, 0x03A2, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, + /* U+03A8 */ 0x03A8, 0x03A9, 0x0399, 0x03A5, 0x03B1, 0x03B5, 0x03B7, 0x03B9, + /* U+03B0 */ 0x03CB, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, + /* U+03B8 */ 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, + /* U+03C0 */ 0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, + /* U+03C8 */ 0x03C8, 0x03C9, 0x03B9, 0x03C5, 0x03BF, 0x03C5, 0x03C9, 0x03CF, + /* U+03D0 */ 0x03B2, 0x03B8, 0x03A5, 0x03D2, 0x03D2, 0x03C6, 0x03C0, 0x03D7, + /* U+03D8 */ 0x03D8, 0x03D9, 0x03DA, 0x03DB, 0x03DC, 0x03DD, 0x03DE, 0x03DF, + /* U+03E0 */ 0x03E0, 0x03E1, 0x03E2, 0x03E3, 0x03E4, 0x03E5, 0x03E6, 0x03E7, + /* U+03E8 */ 0x03E8, 0x03E9, 0x03EA, 0x03EB, 0x03EC, 0x03ED, 0x03EE, 0x03EF, + /* U+03F0 */ 0x03BA, 0x03C1, 0x03C2, 0x03F3, 0x0398, 0x03B5, 0x03F6, 0x03F7, + /* U+03F8 */ 0x03F8, 0x03A3, 0x03FA, 0x03FB, 0x03FC, 0x03FD, 0x03FE, 0x03FF, + /* U+0400 */ 0x0415, 0x0415, 0x0402, 0x0413, 0x0404, 0x0405, 0x0406, 0x0406, + /* U+0408 */ 0x0408, 0x0409, 0x040A, 0x040B, 0x041A, 0x0418, 0x0423, 0x040F, + /* U+0410 */ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + /* U+0418 */ 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F, + // U+0419: Manually changed from 0418 to 0419 + /* U+0420 */ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 
0x0425, 0x0426, 0x0427, + /* U+0428 */ 0x0428, 0x0429, 0x042C, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F, + // U+042A: Manually changed from 042A to 042C + /* U+0430 */ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, + /* U+0438 */ 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F, + // U+0439: Manually changed from 0438 to 0439 + /* U+0440 */ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, + /* U+0448 */ 0x0448, 0x0449, 0x044C, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F, + // U+044A: Manually changed from 044A to 044C + /* U+0450 */ 0x0435, 0x0435, 0x0452, 0x0433, 0x0454, 0x0455, 0x0456, 0x0456, + /* U+0458 */ 0x0458, 0x0459, 0x045A, 0x045B, 0x043A, 0x0438, 0x0443, 0x045F, + /* U+0460 */ 0x0460, 0x0461, 0x0462, 0x0463, 0x0464, 0x0465, 0x0466, 0x0467, + /* U+0468 */ 0x0468, 0x0469, 0x046A, 0x046B, 0x046C, 0x046D, 0x046E, 0x046F, + /* U+0470 */ 0x0470, 0x0471, 0x0472, 0x0473, 0x0474, 0x0475, 0x0474, 0x0475, + /* U+0478 */ 0x0478, 0x0479, 0x047A, 0x047B, 0x047C, 0x047D, 0x047E, 0x047F, + /* U+0480 */ 0x0480, 0x0481, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + /* U+0488 */ 0x0488, 0x0489, 0x048A, 0x048B, 0x048C, 0x048D, 0x048E, 0x048F, + /* U+0490 */ 0x0490, 0x0491, 0x0492, 0x0493, 0x0494, 0x0495, 0x0496, 0x0497, + /* U+0498 */ 0x0498, 0x0499, 0x049A, 0x049B, 0x049C, 0x049D, 0x049E, 0x049F, + /* U+04A0 */ 0x04A0, 0x04A1, 0x04A2, 0x04A3, 0x04A4, 0x04A5, 0x04A6, 0x04A7, + /* U+04A8 */ 0x04A8, 0x04A9, 0x04AA, 0x04AB, 0x04AC, 0x04AD, 0x04AE, 0x04AF, + /* U+04B0 */ 0x04B0, 0x04B1, 0x04B2, 0x04B3, 0x04B4, 0x04B5, 0x04B6, 0x04B7, + /* U+04B8 */ 0x04B8, 0x04B9, 0x04BA, 0x04BB, 0x04BC, 0x04BD, 0x04BE, 0x04BF, + /* U+04C0 */ 0x04C0, 0x0416, 0x0436, 0x04C3, 0x04C4, 0x04C5, 0x04C6, 0x04C7, + /* U+04C8 */ 0x04C8, 0x04C9, 0x04CA, 0x04CB, 0x04CC, 0x04CD, 0x04CE, 0x04CF, + /* U+04D0 */ 0x0410, 0x0430, 0x0410, 0x0430, 0x04D4, 0x04D5, 0x0415, 0x0435, + /* U+04D8 */ 0x04D8, 0x04D9, 0x04D8, 0x04D9, 0x0416, 0x0436, 0x0417, 0x0437, + /* U+04E0 */ 
0x04E0, 0x04E1, 0x0418, 0x0438, 0x0418, 0x0438, 0x041E, 0x043E, + /* U+04E8 */ 0x04E8, 0x04E9, 0x04E8, 0x04E9, 0x042D, 0x044D, 0x0423, 0x0443, + /* U+04F0 */ 0x0423, 0x0443, 0x0423, 0x0443, 0x0427, 0x0447, 0x04F6, 0x04F7, + /* U+04F8 */ 0x042B, 0x044B, 0x04FA, 0x04FB, 0x04FC, 0x04FD, 0x04FE, 0x04FF, +}; + +/* static */ const std::vector CharUtils::EMPTY_STRING(1 /* size */, '\0' /* value */); +} // namespace latinime diff --git a/app/src/main/jni/src/utils/char_utils.h b/app/src/main/jni/src/utils/char_utils.h new file mode 100644 index 000000000..7871c26ef --- /dev/null +++ b/app/src/main/jni/src/utils/char_utils.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2010 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_CHAR_UTILS_H +#define LATINIME_CHAR_UTILS_H + +#include +#include +#include + +#include "defines.h" + +namespace latinime { + +class CharUtils { + public: + static const std::vector EMPTY_STRING; + + static AK_FORCE_INLINE bool isAsciiUpper(int c) { + // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to + // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). 
+ return (c >= 'A' && c <= 'Z'); + } + + static AK_FORCE_INLINE int toLowerCase(const int c) { + if (isAsciiUpper(c)) { + return toAsciiLower(c); + } + if (isAscii(c)) { + return c; + } + return latin_tolower(c); + } + + static AK_FORCE_INLINE int toBaseLowerCase(const int c) { + return toLowerCase(toBaseCodePoint(c)); + } + + static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) { + // TODO: Do not hardcode here + return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; + } + static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { + int size = 0; + for (; size < arraySize; ++size) { + if (codePoints[size] == '\0') { + break; + } + } + return size; + } + + static AK_FORCE_INLINE int toBaseCodePoint(int c) { + if (c < BASE_CHARS_SIZE) { + return static_cast(BASE_CHARS[c]); + } + return c; + } + + static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) { + int spaceCount = 0; + for (int i = 0; i < length; ++i) { + if (codePointBuffer[i] == KEYCODE_SPACE) { + ++spaceCount; + } + } + return spaceCount; + } + + static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { + return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; + } + + // Returns updated code point count. Returns 0 when the code points cannot be marked as a + // Beginning-of-Sentence. + static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, + const int codePointCount, const int maxCodePoint) { + if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { + // Marker has already been attached. + return codePointCount; + } + if (codePointCount >= maxCodePoint) { + // the code points cannot be marked as a Beginning-of-Sentence. 
+ return 0; + } + memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); + codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; + return codePointCount + 1; + } + + // Returns updated code point count. + static AK_FORCE_INLINE int removeBeginningOfSentenceMarker(int *const codePoints, + const int codePointCount) { + if (codePointCount <= 0 || codePoints[0] != CODE_POINT_BEGINNING_OF_SENTENCE) { + return codePointCount; + } + const int newCodePointCount = codePointCount - 1; + memmove(codePoints, codePoints + 1, sizeof(int) * newCodePointCount); + return newCodePointCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); + + static const int MIN_UNICODE_CODE_POINT; + static const int MAX_UNICODE_CODE_POINT; + + /** + * Table mapping most combined Latin, Greek, and Cyrillic characters + * to their base characters. If c is in range, BASE_CHARS[c] == c + * if c is not a combined character, or the base character if it + * is combined. + */ + static const int BASE_CHARS_SIZE = 0x0500; + static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; + + static AK_FORCE_INLINE bool isAscii(int c) { + return isascii(c) != 0; + } + + static AK_FORCE_INLINE int toAsciiLower(int c) { + return c - 'A' + 'a'; + } + + static int latin_tolower(const int c); +}; +} // namespace latinime +#endif // LATINIME_CHAR_UTILS_H diff --git a/app/src/main/jni/src/utils/int_array_view.h b/app/src/main/jni/src/utils/int_array_view.h new file mode 100644 index 000000000..e0f671056 --- /dev/null +++ b/app/src/main/jni/src/utils/int_array_view.h @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_INT_ARRAY_VIEW_H +#define LATINIME_INT_ARRAY_VIEW_H + +#include +#include +#include +#include +#include + +#include "defines.h" + +namespace latinime { + +/** + * Helper class used to provide a read-only view of a given range of integer array. This class + * does not take ownership of the underlying integer array but is designed to be a lightweight + * object that obeys value semantics. + * + * Example: + * + * bool containsX(IntArrayView view) { + * for (size_t i = 0; i < view.size(); ++i) { + * if (view[i] == 'X') { + * return true; + * } + * } + * return false; + * } + * + * const int codePointArray[] = { 'A', 'B', 'X', 'Z' }; + * auto view = IntArrayView(codePointArray, NELEMS(codePointArray)); + * const bool hasX = containsX(view); + * + */ +class IntArrayView { + public: + IntArrayView() : mPtr(nullptr), mSize(0) {} + + IntArrayView(const int *const ptr, const size_t size) + : mPtr(ptr), mSize(size) {} + + explicit IntArrayView(const std::vector &vector) + : mPtr(vector.data()), mSize(vector.size()) {} + + template + AK_FORCE_INLINE static IntArrayView fromArray(const std::array &array) { + return IntArrayView(array.data(), array.size()); + } + + // Returns a view that points to one int object.
+ AK_FORCE_INLINE static IntArrayView singleElementView(const int *const ptr) { + return IntArrayView(ptr, 1); + } + + AK_FORCE_INLINE int operator[](const size_t index) const { + ASSERT(index < mSize); + return mPtr[index]; + } + + AK_FORCE_INLINE bool empty() const { + return size() == 0; + } + + AK_FORCE_INLINE size_t size() const { + return mSize; + } + + AK_FORCE_INLINE const int *data() const { + return mPtr; + } + + AK_FORCE_INLINE const int *begin() const { + return mPtr; + } + + AK_FORCE_INLINE const int *end() const { + return mPtr + mSize; + } + + AK_FORCE_INLINE bool contains(const int value) const { + return std::find(begin(), end(), value) != end(); + } + + // Returns the view whose size is smaller than or equal to the given count. + AK_FORCE_INLINE const IntArrayView limit(const size_t maxSize) const { + return IntArrayView(mPtr, std::min(maxSize, mSize)); + } + + AK_FORCE_INLINE const IntArrayView skip(const size_t n) const { + if (mSize <= n) { + return IntArrayView(); + } + return IntArrayView(mPtr + n, mSize - n); + } + + template + void copyToArray(std::array *const buffer, const size_t offset) const { + ASSERT(mSize + offset <= N); + memmove(buffer->data() + offset, mPtr, sizeof(int) * mSize); + } + + AK_FORCE_INLINE int firstOrDefault(const int defaultValue) const { + if (empty()) { + return defaultValue; + } + return mPtr[0]; + } + + AK_FORCE_INLINE int lastOrDefault(const int defaultValue) const { + if (empty()) { + return defaultValue; + } + return mPtr[mSize - 1]; + } + + AK_FORCE_INLINE std::vector toVector() const { + return std::vector(begin(), end()); + } + + std::vector split(const int separator, const int limit = S_INT_MAX) const { + if (limit <= 0) { + return std::vector(); + } + std::vector result; + if (limit == 1) { + result.emplace_back(mPtr, mSize); + return result; + } + size_t startIndex = 0; + for (size_t i = 0; i < mSize; ++i) { + if (mPtr[i] == separator) { + result.emplace_back(mPtr + startIndex, i - startIndex); + 
startIndex = i + 1; + if (result.size() >= static_cast(limit - 1)) { + break; + } + } + } + result.emplace_back(mPtr + startIndex, mSize - startIndex); + return result; + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(IntArrayView); + + const int *const mPtr; + const size_t mSize; +}; + +using WordIdArrayView = IntArrayView; +using PtNodePosArrayView = IntArrayView; +using CodePointArrayView = IntArrayView; +template +using WordIdArray = std::array; + +} // namespace latinime +#endif // LATINIME_INT_ARRAY_VIEW_H diff --git a/app/src/main/jni/src/utils/jni_data_utils.cpp b/app/src/main/jni/src/utils/jni_data_utils.cpp new file mode 100644 index 000000000..41f0623d8 --- /dev/null +++ b/app/src/main/jni/src/utils/jni_data_utils.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include "utils/jni_data_utils.h" + +#include "utils/int_array_view.h" + +namespace latinime { + +const int JniDataUtils::CODE_POINT_REPLACEMENT_CHARACTER = 0xFFFD; +const int JniDataUtils::CODE_POINT_NULL = 0; + +/* static */ void JniDataUtils::outputWordProperty(JNIEnv *const env, + const WordProperty &wordProperty, jintArray outCodePoints, jbooleanArray outFlags, + jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, + jobject outNgramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) { + const CodePointArrayView codePoints = wordProperty.getCodePoints(); + JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, + MAX_WORD_LENGTH /* maxLength */, codePoints.data(), codePoints.size(), + false /* needsNullTermination */); + const UnigramProperty &unigramProperty = wordProperty.getUnigramProperty(); + const std::vector &ngrams = wordProperty.getNgramProperties(); + jboolean flags[] = {unigramProperty.isNotAWord(), unigramProperty.isPossiblyOffensive(), + !ngrams.empty(), unigramProperty.hasShortcuts(), + unigramProperty.representsBeginningOfSentence()}; + env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); + const HistoricalInfo &historicalInfo = unigramProperty.getHistoricalInfo(); + int probabilityInfo[] = {unigramProperty.getProbability(), historicalInfo.getTimestamp(), + historicalInfo.getLevel(), historicalInfo.getCount()}; + env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo), + probabilityInfo); + + jclass integerClass = env->FindClass("java/lang/Integer"); + jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); // "<init>" is the JNI name of a constructor + jclass arrayListClass = env->FindClass("java/util/ArrayList"); + jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); + + // Output ngrams.
+ jclass intArrayClass = env->FindClass("[I"); + for (const auto &ngramProperty : ngrams) { + const NgramContext *const ngramContext = ngramProperty.getNgramContext(); + jobjectArray prevWordWordCodePointsArray = env->NewObjectArray( + ngramContext->getPrevWordCount(), intArrayClass, nullptr); + jbooleanArray prevWordIsBeginningOfSentenceArray = + env->NewBooleanArray(ngramContext->getPrevWordCount()); + for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) { + const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1); + jintArray prevWordCodePoints = env->NewIntArray(codePoints.size()); + JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */, + codePoints.size(), codePoints.data(), codePoints.size(), + false /* needsNullTermination */); + env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints); + env->DeleteLocalRef(prevWordCodePoints); + JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i, + ngramContext->isNthPrevWordBeginningOfSentence(i + 1)); + } + env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray); + env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId, + prevWordIsBeginningOfSentenceArray); + env->DeleteLocalRef(prevWordWordCodePointsArray); + env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray); + + const std::vector *const targetWordCodePoints = ngramProperty.getTargetCodePoints(); + jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size()); + JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */, + targetWordCodePoints->size(), targetWordCodePoints->data(), + targetWordCodePoints->size(), false /* needsNullTermination */); + env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray); + env->DeleteLocalRef(targetWordCodePointArray); + + const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo(); + int 
bigramProbabilityInfo[] = {ngramProperty.getProbability(), + ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(), + ngramHistoricalInfo.getCount()}; + jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); + env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, + NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); + env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray); + env->DeleteLocalRef(bigramProbabilityInfoArray); + } + + // Output shortcuts. + for (const auto &shortcut : unigramProperty.getShortcuts()) { + const std::vector *const targetCodePoints = shortcut.getTargetCodePoints(); + jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size()); + JniDataUtils::outputCodePoints(env, shortcutTargetCodePointArray, 0 /* start */, + targetCodePoints->size(), targetCodePoints->data(), targetCodePoints->size(), + false /* needsNullTermination */); + env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); + env->DeleteLocalRef(shortcutTargetCodePointArray); + jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, + shortcut.getProbability()); + env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability); + env->DeleteLocalRef(integerProbability); + } + env->DeleteLocalRef(integerClass); + env->DeleteLocalRef(arrayListClass); +} + +} // namespace latinime diff --git a/app/src/main/jni/src/utils/jni_data_utils.h b/app/src/main/jni/src/utils/jni_data_utils.h new file mode 100644 index 000000000..8024e34c4 --- /dev/null +++ b/app/src/main/jni/src/utils/jni_data_utils.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_JNI_DATA_UTILS_H
+#define LATINIME_JNI_DATA_UTILS_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/word_property.h"
+#include "jni.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+/**
+ * Static helpers that move data between JNI handles (Java arrays and strings) and the native
+ * dictionary types.
+ */
+class JniDataUtils {
+ public:
+    /**
+     * Copies a Java int[] into outVector. A null array leaves outVector empty.
+     */
+    static void jintarrayToVector(JNIEnv *env, jintArray array, std::vector<int> *const outVector) {
+        if (!array) {
+            outVector->clear();
+            return;
+        }
+        const jsize arrayLength = env->GetArrayLength(array);
+        outVector->resize(arrayLength);
+        env->GetIntArrayRegion(array, 0 /* start */, arrayLength, outVector->data());
+    }
+
+    /**
+     * Builds an attribute map from two parallel Java String[] arrays.
+     *
+     * The number of entries is taken from attributeKeyStringArray; the value at the same index
+     * of attributeValueStringArray is stored under each key.
+     */
+    static DictionaryHeaderStructurePolicy::AttributeMap constructAttributeMap(JNIEnv *env,
+            jobjectArray attributeKeyStringArray, jobjectArray attributeValueStringArray) {
+        DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
+        const int keyCount = env->GetArrayLength(attributeKeyStringArray);
+        for (int i = 0; i < keyCount; i++) {
+            jstring keyString = static_cast<jstring>(
+                    env->GetObjectArrayElement(attributeKeyStringArray, i));
+            const jsize keyUtf8Length = env->GetStringUTFLength(keyString);
+            // One extra byte for the terminator written below.
+            char keyChars[keyUtf8Length + 1];
+            env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars);
+            env->DeleteLocalRef(keyString);
+            keyChars[keyUtf8Length] = '\0';
+            DictionaryHeaderStructurePolicy::AttributeMap::key_type key;
+            HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key);
+
+            jstring valueString = static_cast<jstring>(
+                    env->GetObjectArrayElement(attributeValueStringArray, i));
+            const jsize valueUtf8Length = env->GetStringUTFLength(valueString);
+            char valueChars[valueUtf8Length + 1];
+            env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars);
+            env->DeleteLocalRef(valueString);
+            valueChars[valueUtf8Length] = '\0';
+            DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value;
+            HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value);
+            attributeMap[key] = value;
+        }
+        return attributeMap;
+    }
+
+    /**
+     * Writes sanitized code points into a Java int[] beginning at index start.
+     *
+     * At most min(maxLength, codePointCount) input code points are examined. Code points outside
+     * the Unicode space are replaced with U+FFFD, except CODE_POINT_BEGINNING_OF_SENTENCE which is
+     * dropped; control codes 0x01..0x1F are replaced with U+FFFD as well. When
+     * needsNullTermination is set and fewer than maxLength code points were written, a single 0
+     * is stored after the last one.
+     */
+    static void outputCodePoints(JNIEnv *env, jintArray intArrayToOutputCodePoints, const int start,
+            const int maxLength, const int *const codePoints, const int codePointCount,
+            const bool needsNullTermination) {
+        const int codePointBufSize = std::min(maxLength, codePointCount);
+        int sanitizedCodePoints[codePointBufSize];
+        int outputCodePointCount = 0;
+        for (int i = 0; i < codePointBufSize; ++i) {
+            const int codePoint = codePoints[i];
+            int codePointToOutput = codePoint;
+            if (!CharUtils::isInUnicodeSpace(codePoint)) {
+                if (codePoint == CODE_POINT_BEGINNING_OF_SENTENCE) {
+                    // Just skip Beginning-of-Sentence marker.
+                    continue;
+                }
+                codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER;
+            } else if (codePoint >= 0x01 && codePoint <= 0x1F) {
+                // Control code.
+                codePointToOutput = CODE_POINT_REPLACEMENT_CHARACTER;
+            }
+            sanitizedCodePoints[outputCodePointCount++] = codePointToOutput;
+        }
+        env->SetIntArrayRegion(intArrayToOutputCodePoints, start, outputCodePointCount,
+                sanitizedCodePoints);
+        if (needsNullTermination && outputCodePointCount < maxLength) {
+            env->SetIntArrayRegion(intArrayToOutputCodePoints, start + outputCodePointCount,
+                    1 /* len */, &CODE_POINT_NULL);
+        }
+    }
+
+    /**
+     * Builds an NgramContext from a Java int[][] of previous words and a parallel boolean[].
+     *
+     * Null entries and entries longer than MAX_WORD_LENGTH are recorded as empty words that are
+     * not a beginning of sentence. At most MAX_PREV_WORD_COUNT_FOR_N_GRAM previous words are
+     * read; any beyond that are ignored so the fixed-size buffers below cannot overflow.
+     */
+    static NgramContext constructNgramContext(JNIEnv *env, jobjectArray prevWordCodePointArrays,
+            jbooleanArray isBeginningOfSentenceArray, const size_t prevWordCount) {
+        int prevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+        int prevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+        bool isBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+        const size_t usablePrevWordCount = std::min(prevWordCount,
+                static_cast<size_t>(MAX_PREV_WORD_COUNT_FOR_N_GRAM));
+        for (size_t i = 0; i < usablePrevWordCount; ++i) {
+            prevWordCodePointCount[i] = 0;
+            isBeginningOfSentence[i] = false;
+            jintArray prevWord = static_cast<jintArray>(
+                    env->GetObjectArrayElement(prevWordCodePointArrays, i));
+            if (!prevWord) {
+                continue;
+            }
+            jsize prevWordLength = env->GetArrayLength(prevWord);
+            if (prevWordLength > MAX_WORD_LENGTH) {
+                // Too long to store; release the reference before skipping the word.
+                env->DeleteLocalRef(prevWord);
+                continue;
+            }
+            env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]);
+            env->DeleteLocalRef(prevWord);
+            prevWordCodePointCount[i] = prevWordLength;
+            jboolean isBeginningOfSentenceBoolean = JNI_FALSE;
+            env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */,
+                    &isBeginningOfSentenceBoolean);
+            isBeginningOfSentence[i] = isBeginningOfSentenceBoolean == JNI_TRUE;
+        }
+        return NgramContext(prevWordCodePoints, prevWordCodePointCount, isBeginningOfSentence,
+                usablePrevWordCount);
+    }
+
+    /** Stores one boolean at the given index of a Java boolean[]. */
+    static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index,
+            const jboolean value) {
+        env->SetBooleanArrayRegion(array, index, 1 /* len */, &value);
+    }
+
+    /** Stores one int at the given index of a Java int[]. */
+    static void putIntToArray(JNIEnv *env, jintArray array, const int index, const int value) {
+        env->SetIntArrayRegion(array, index, 1 /* len */, &value);
+    }
+
+    /** Stores one float at the given index of a Java float[]. */
+    static void putFloatToArray(JNIEnv *env, jfloatArray array, const int index,
+            const float value) {
+        env->SetFloatArrayRegion(array, index, 1 /* len */, &value);
+    }
+
+    /** Exports a WordProperty into Java containers; defined in jni_data_utils.cpp. */
+    static void outputWordProperty(JNIEnv *const env, const WordProperty &wordProperty,
+            jintArray outCodePoints, jbooleanArray outFlags, jintArray outProbabilityInfo,
+            jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray,
+            jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets,
+            jobject outShortcutProbabilities);
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(JniDataUtils);
+
+    static const int CODE_POINT_REPLACEMENT_CHARACTER;
+    static const int CODE_POINT_NULL;
+};
+} // namespace latinime
+#endif // LATINIME_JNI_DATA_UTILS_H
diff --git a/app/src/main/jni/src/utils/log_utils.cpp b/app/src/main/jni/src/utils/log_utils.cpp
new file mode 100644
index 000000000..5ab2b2862
--- /dev/null
+++ b/app/src/main/jni/src/utils/log_utils.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "log_utils.h"
+
+#include <cstdarg>
+#include <cstdio>
+
+#include "defines.h"
+
+namespace latinime {
+    /**
+     * Formats a printf-style message and passes it to android.util.Log.i under the tag
+     * "LatinIME:LogUtils".
+     *
+     * Logging is best effort: when the Log class or its method cannot be resolved, or the
+     * message cannot be formatted, the call returns without logging.
+     */
+    /* static */ void LogUtils::logToJava(JNIEnv *const env, const char *const format, ...) {
+        static const char *TAG = "LatinIME:LogUtils";
+        const jclass androidUtilLogClass = env->FindClass("android/util/Log");
+        if (!androidUtilLogClass) {
+            // If we can't find the class, we are probably in off-device testing, and
+            // it's expected. Regardless, logging is not essential to functionality, so
+            // we should just return. However, FindClass has thrown an exception behind
+            // our back and there is no way to prevent it from doing that, so we clear
+            // the exception before we return.
+            env->ExceptionClear();
+            return;
+        }
+        const jmethodID logDotIMethodId = env->GetStaticMethodID(androidUtilLogClass, "i",
+                "(Ljava/lang/String;Ljava/lang/String;)I");
+        if (!logDotIMethodId) {
+            env->ExceptionClear();
+            env->DeleteLocalRef(androidUtilLogClass);
+            return;
+        }
+        const jstring javaTag = env->NewStringUTF(TAG);
+
+        static const int DEFAULT_LINE_SIZE = 128;
+        char fixedSizeCString[DEFAULT_LINE_SIZE];
+        va_list argList;
+        va_start(argList, format);
+        const int formattedLength =
+                vsnprintf(fixedSizeCString, DEFAULT_LINE_SIZE, format, argList);
+        va_end(argList);
+        if (formattedLength < 0) {
+            // vsnprintf reported an error, so the buffer contents cannot be trusted.
+            if (javaTag) env->DeleteLocalRef(javaTag);
+            env->DeleteLocalRef(androidUtilLogClass);
+            return;
+        }
+        // Get the necessary size. Add 1 for the 0 terminator.
+        const int size = formattedLength + 1;
+
+        jstring javaString;
+        if (size <= DEFAULT_LINE_SIZE) {
+            // The buffer was large enough.
+            javaString = env->NewStringUTF(fixedSizeCString);
+        } else {
+            // The buffer was not large enough.
+            va_start(argList, format);
+            char variableSizeCString[size];
+            vsnprintf(variableSizeCString, size, format, argList);
+            va_end(argList);
+            javaString = env->NewStringUTF(variableSizeCString);
+        }
+
+        env->CallStaticIntMethod(androidUtilLogClass, logDotIMethodId, javaTag, javaString);
+        if (javaString) env->DeleteLocalRef(javaString);
+        if (javaTag) env->DeleteLocalRef(javaTag);
+        env->DeleteLocalRef(androidUtilLogClass);
+    }
+}
diff --git a/app/src/main/jni/src/utils/log_utils.h b/app/src/main/jni/src/utils/log_utils.h
new file mode 100644
index 000000000..6ac16d91a
--- /dev/null
+++ b/app/src/main/jni/src/utils/log_utils.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_LOG_UTILS_H
+#define LATINIME_LOG_UTILS_H
+
+#include "defines.h"
+#include "jni.h"
+
+namespace latinime {
+
+/** Logging helper that routes native messages through the Java-side logger. */
+class LogUtils {
+ public:
+    // printf-style; on GCC-compatible compilers the format string (argument 2) is checked
+    // against the variadic arguments (starting at argument 3).
+    static void logToJava(JNIEnv *const env, const char *const format, ...)
+#ifdef __GNUC__
+            __attribute__ ((format (printf, 2, 3)))
+#endif // __GNUC__
+            ;
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(LogUtils);
+};
+} // namespace latinime
+#endif // LATINIME_LOG_UTILS_H
diff --git a/app/src/main/jni/src/utils/ngram_utils.h b/app/src/main/jni/src/utils/ngram_utils.h
new file mode 100644
index 000000000..fa85ba35f
--- /dev/null
+++ b/app/src/main/jni/src/utils/ngram_utils.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_UTILS_H
+#define LATINIME_NGRAM_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// N-gram order expressed as a 0-origin value: the enumerator equals (word count - 1).
+enum class NgramType : int {
+    Unigram = 0,
+    Bigram = 1,
+    Trigram = 2,
+    Quadgram = 3,
+    NotANgramType = -1,
+};
+
+namespace AllNgramTypes {
+// Use anonymous namespace to avoid ODR (One Definition Rule) violation.
+namespace {
+
+// Unigram through trigram, lowest order first.
+const NgramType ASCENDING[] = {
+    NgramType::Unigram, NgramType::Bigram, NgramType::Trigram
+};
+
+// Trigram through unigram, highest order first.
+const NgramType DESCENDING[] = {
+    NgramType::Trigram, NgramType::Bigram, NgramType::Unigram
+};
+
+}  // namespace
+}  // namespace AllNgramTypes
+
+class NgramUtils final {
+ public:
+    // Maps a word count (target word plus previous words) to its NgramType, or
+    // NotANgramType when the count is not supported.
+    static AK_FORCE_INLINE NgramType getNgramTypeFromWordCount(const int wordCount) {
+        // Max supported ngram is (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram.
+        if (wordCount <= 0 || wordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1) {
+            return NgramType::NotANgramType;
+        }
+        // Convert word count to 0-origin enum value.
+        return static_cast<NgramType>(wordCount - 1);
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(NgramUtils);
+
+};
+}
+#endif /* LATINIME_NGRAM_UTILS_H */
diff --git a/app/src/main/jni/src/utils/profiler.h b/app/src/main/jni/src/utils/profiler.h
new file mode 100644
index 000000000..5f107fed3
--- /dev/null
+++ b/app/src/main/jni/src/utils/profiler.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROFILER_H
+#define LATINIME_PROFILER_H
+
+#ifdef FLAG_DO_PROFILE
+
+#include "defines.h"
+
+#include <ctime>
+#include <unordered_map>
+
+namespace latinime {
+
+/**
+ * Scope-bound profiler: timers are keyed by an int id, and the accumulated time and call count
+ * of every timer are logged when the object is destroyed.
+ */
+class Profiler final {
+ public:
+    Profiler(const clockid_t clockId)
+            : mClockId(clockId), mStartTime(getTimeInMicroSec()), mStartTimes(), mTimes(),
+              mCounters() {}
+
+    ~Profiler() {
+        // Microseconds to milliseconds.
+        const float totalTime =
+                static_cast<float>(getTimeInMicroSec() - mStartTime) / 1000.f;
+        AKLOGI("Total time is %6.3f ms.", totalTime);
+        for (const auto &time : mTimes) {
+            AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", time.first,
+                    time.second / totalTime * 100.0f, time.second, mCounters[time.first]);
+        }
+    }
+
+    // Records the current time as the start of timer 'id'.
+    void startTimer(const int id) {
+        mStartTimes[id] = getTimeInMicroSec();
+    }
+
+    // Adds the time elapsed since startTimer(id) to timer 'id' and counts one call.
+    void endTimer(const int id) {
+        mTimes[id] += static_cast<float>(getTimeInMicroSec() - mStartTimes[id]) / 1000.0f;
+        mCounters[id]++;
+    }
+
+    operator bool() const { return false; }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Profiler);
+
+    const clockid_t mClockId;
+    int64_t mStartTime;
+    // Start timestamp of each timer, in microseconds.
+    std::unordered_map<int, int64_t> mStartTimes;
+    // Accumulated time of each timer, in milliseconds.
+    std::unordered_map<int, float> mTimes;
+    // Number of completed start/end pairs of each timer.
+    std::unordered_map<int, int> mCounters;
+
+    int64_t getTimeInMicroSec() {
+        timespec time;
+        clock_gettime(mClockId, &time);
+        return static_cast<int64_t>(time.tv_sec) * 1000000
+                + static_cast<int64_t>(time.tv_nsec) / 1000;
+    }
+};
+} // namespace latinime
+
+#define PROF_INIT Profiler __LATINIME__PROFILER__(CLOCK_THREAD_CPUTIME_ID)
+#define PROF_TIMER_START(timer_id) __LATINIME__PROFILER__.startTimer(timer_id)
+#define PROF_TIMER_END(timer_id) __LATINIME__PROFILER__.endTimer(timer_id)
+
+#else // FLAG_DO_PROFILE
+
+#define PROF_INIT
+#define PROF_TIMER_START(timer_id)
+#define PROF_TIMER_END(timer_id)
+
+#endif // FLAG_DO_PROFILE
+
+#endif /* LATINIME_PROFILER_H */
diff --git a/app/src/main/jni/src/utils/time_keeper.cpp b/app/src/main/jni/src/utils/time_keeper.cpp
new file mode 100644
index 000000000..026284060
--- /dev/null
+++ b/app/src/main/jni/src/utils/time_keeper.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/time_keeper.h"
+
+#include <ctime>
+
+namespace latinime {
+
+int TimeKeeper::sCurrentTime;
+bool TimeKeeper::sSetForTesting;
+
+// Refreshes the cached time from the system clock, unless a test has pinned it.
+/* static */ void TimeKeeper::setCurrentTime() {
+    if (!sSetForTesting) {
+        // The cached value is an int, so the time_t result is narrowed explicitly.
+        sCurrentTime = static_cast<int>(time(nullptr));
+    }
+}
+
+// Pins the cached time to currentTime until stopTestMode() is called.
+/* static */ void TimeKeeper::startTestModeWithForceCurrentTime(const int currentTime) {
+    sCurrentTime = currentTime;
+    sSetForTesting = true;
+}
+
+// Lets setCurrentTime() read the system clock again; the cached value itself is unchanged.
+/* static */ void TimeKeeper::stopTestMode() {
+    sSetForTesting = false;
+}
+
+} // namespace latinime
diff --git a/app/src/main/jni/src/utils/time_keeper.h b/app/src/main/jni/src/utils/time_keeper.h
new file mode 100644
index 000000000..d066757e4
--- /dev/null
+++ b/app/src/main/jni/src/utils/time_keeper.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_TIME_KEEPER_H
+#define LATINIME_TIME_KEEPER_H
+
+#include "defines.h"
+
+namespace latinime {
+
+/**
+ * Process-wide cache of the "current time" as an int, with a test mode that pins the value.
+ */
+class TimeKeeper {
+ public:
+    // Refreshes the cached time; has no effect while test mode is active.
+    static void setCurrentTime();
+
+    // Enters test mode and pins the cached time to currentTime.
+    static void startTestModeWithForceCurrentTime(const int currentTime);
+
+    // Leaves test mode.
+    static void stopTestMode();
+
+    // Returns the cached time without refreshing it.
+    static int peekCurrentTime() { return sCurrentTime; }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(TimeKeeper);
+
+    static int sCurrentTime;
+    static bool sSetForTesting;
+};
+} // namespace latinime
+#endif /* LATINIME_TIME_KEEPER_H */
diff --git a/app/src/main/jni/tests/defines_test.cpp b/app/src/main/jni/tests/defines_test.cpp
new file mode 100644
index 000000000..f7b80b2b5
--- /dev/null
+++ b/app/src/main/jni/tests/defines_test.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "defines.h"
+
+#include <gtest/gtest.h>
+
+namespace latinime {
+namespace {
+
+// NELEMS must report the element count of a fixed-length array, whatever its size.
+TEST(DefinesTest, NELEMSForFixedLengthArray) {
+    const size_t SMALL_ARRAY_SIZE = 1;
+    const size_t LARGE_ARRAY_SIZE = 100;
+    int smallArray[SMALL_ARRAY_SIZE];
+    int largeArray[LARGE_ARRAY_SIZE];
+    EXPECT_EQ(SMALL_ARRAY_SIZE, NELEMS(smallArray));
+    EXPECT_EQ(LARGE_ARRAY_SIZE, NELEMS(largeArray));
+}
+
+}  // namespace
+}  // namespace latinime
diff --git a/app/src/main/jni/tests/dictionary/header/header_read_write_utils_test.cpp b/app/src/main/jni/tests/dictionary/header/header_read_write_utils_test.cpp
new file mode 100644
index 000000000..eab5d6575
--- /dev/null
+++ b/app/src/main/jni/tests/dictionary/header/header_read_write_utils_test.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/header/header_read_write_utils.h"
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+#include <vector>
+
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+
+namespace latinime {
+namespace {
+
+// An empty string inserts nothing; an ASCII string is inserted one element per character.
+TEST(HeaderReadWriteUtilsTest, TestInsertCharactersIntoVector) {
+    DictionaryHeaderStructurePolicy::AttributeMap::key_type vector;
+
+    HeaderReadWriteUtils::insertCharactersIntoVector("", &vector);
+    EXPECT_TRUE(vector.empty());
+
+    static const char *str = "abc-xyz!?";
+    HeaderReadWriteUtils::insertCharactersIntoVector(str, &vector);
+    EXPECT_EQ(strlen(str), vector.size());
+    for (size_t i = 0; i < vector.size(); ++i) {
+        EXPECT_EQ(str[i], vector[i]);
+    }
+}
+
+// Int attributes: missing keys yield the default, writes overwrite, keys are independent.
+TEST(HeaderReadWriteUtilsTest, TestAttributeMapForInt) {
+    DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
+
+    // Returns default value if not exists.
+    EXPECT_EQ(-1, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "", -1));
+    EXPECT_EQ(100, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100));
+
+    HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abc", 10);
+    EXPECT_EQ(10, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100));
+    HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abc", 20);
+    EXPECT_EQ(20, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100));
+    HeaderReadWriteUtils::setIntAttribute(&attributeMap, "abcd", 30);
+    EXPECT_EQ(30, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abcd", 100));
+    EXPECT_EQ(20, HeaderReadWriteUtils::readIntAttributeValue(&attributeMap, "abc", 100));
+}
+
+// Code point vector attributes: missing keys yield an empty vector, stored values round-trip.
+TEST(HeaderReadWriteUtilsTest, TestAttributeMapCodeForPoints) {
+    DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
+
+    // Returns empty vector if not exists.
+    EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue(&attributeMap, "").empty());
+    EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+            &attributeMap, "abc").empty());
+
+    HeaderReadWriteUtils::setCodePointVectorAttribute(&attributeMap, "abc", {});
+    EXPECT_TRUE(HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+            &attributeMap, "abc").empty());
+
+    const std::vector<int> codePoints = { 0x0, 0x20, 0x1F, 0x100000 };
+    HeaderReadWriteUtils::setCodePointVectorAttribute(&attributeMap, "abc", codePoints);
+    EXPECT_EQ(codePoints, HeaderReadWriteUtils::readCodePointVectorAttributeValue(
+            &attributeMap, "abc"));
+}
+
+}  // namespace
+}  // namespace latinime
diff --git a/app/src/main/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp b/app/src/main/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp
new file mode 100644
index 000000000..2e3047eda
--- /dev/null
+++ b/app/src/main/jni/tests/dictionary/structure/v4/content/language_model_dict_content_global_counters_test.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h"
+
+#include <gtest/gtest.h>
+
+#include <climits>
+
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+namespace {
+
+// Halving becomes necessary once a counter reaches the largest value the count field can
+// hold, and is no longer necessary after the counters have been halved.
+TEST(LanguageModelDictContentGlobalCountersTest, TestUpdateMaxValueOfCounters) {
+    LanguageModelDictContentGlobalCounters globalCounters;
+
+    EXPECT_FALSE(globalCounters.needsToHalveCounters());
+    globalCounters.updateMaxValueOfCounters(10);
+    EXPECT_FALSE(globalCounters.needsToHalveCounters());
+    // All bits of a WORD_COUNT_FIELD_SIZE-byte field set.
+    const int count = (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 1;
+    globalCounters.updateMaxValueOfCounters(count);
+    EXPECT_TRUE(globalCounters.needsToHalveCounters());
+    globalCounters.halveCounters();
+    EXPECT_FALSE(globalCounters.needsToHalveCounters());
+}
+
+// The total count goes up by one per increment and is integer-halved by halveCounters().
+TEST(LanguageModelDictContentGlobalCountersTest, TestIncrementTotalCount) {
+    LanguageModelDictContentGlobalCounters globalCounters;
+
+    EXPECT_EQ(0, globalCounters.getTotalCount());
+    globalCounters.incrementTotalCount();
+    EXPECT_EQ(1, globalCounters.getTotalCount());
+    for (int i = 1; i < 50; ++i) {
+        globalCounters.incrementTotalCount();
+    }
+    EXPECT_EQ(50, globalCounters.getTotalCount());
+    globalCounters.halveCounters();
+    EXPECT_EQ(25, globalCounters.getTotalCount());
+    globalCounters.halveCounters();
+    EXPECT_EQ(12, globalCounters.getTotalCount());
+    // 12 -> 6 -> 3 -> 1 -> 0.
+    for (int i = 0; i < 4; ++i) {
+        globalCounters.halveCounters();
+    }
+    EXPECT_EQ(0, globalCounters.getTotalCount());
+}
+
+}  // namespace
+}  // namespace latinime
diff --git a/app/src/main/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp b/app/src/main/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp
new file mode 100644
index 000000000..ab11975c2
--- /dev/null
+++ b/app/src/main/jni/tests/dictionary/structure/v4/content/language_model_dict_content_test.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License,
Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+
+#include <gtest/gtest.h>
+
+#include <array>
+#include <unordered_set>
+
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace {
+
+// A unigram entry without historical info can be stored, read back, removed and re-added.
+TEST(LanguageModelDictContentTest, TestUnigramProbability) {
+    LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
+
+    const int flag = 0xF0;
+    const int probability = 10;
+    const int wordId = 100;
+    const ProbabilityEntry probabilityEntry(flag, probability);
+    languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry);
+    const ProbabilityEntry entry =
+            languageModelDictContent.getProbabilityEntry(wordId);
+    EXPECT_EQ(flag, entry.getFlags());
+    EXPECT_EQ(probability, entry.getProbability());
+
+    // Remove
+    EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId));
+    EXPECT_FALSE(languageModelDictContent.getProbabilityEntry(wordId).isValid());
+    EXPECT_FALSE(languageModelDictContent.removeProbabilityEntry(wordId));
+    EXPECT_TRUE(languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry));
+    EXPECT_TRUE(languageModelDictContent.getProbabilityEntry(wordId).isValid());
+}
+
+// Same round trip as above, for an entry that carries historical info.
+TEST(LanguageModelDictContentTest, TestUnigramProbabilityWithHistoricalInfo) {
+    LanguageModelDictContent languageModelDictContent(true /* useHistoricalInfo */);
+
+    const int flag = 0xF0;
+    const int timestamp = 0x3FFFFFFF;
+    const int count = 10;
+    const int wordId = 100;
+    const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count);
+    const ProbabilityEntry probabilityEntry(flag, &historicalInfo);
+    languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry);
+    const ProbabilityEntry entry = languageModelDictContent.getProbabilityEntry(wordId);
+    EXPECT_EQ(flag, entry.getFlags());
+    EXPECT_EQ(timestamp, entry.getHistoricalInfo()->getTimestamp());
+    EXPECT_EQ(count, entry.getHistoricalInfo()->getCount());
+
+    // Remove
+    EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId));
+    EXPECT_FALSE(languageModelDictContent.getProbabilityEntry(wordId).isValid());
+    EXPECT_FALSE(languageModelDictContent.removeProbabilityEntry(wordId));
+    EXPECT_TRUE(languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry));
+    EXPECT_TRUE(languageModelDictContent.removeProbabilityEntry(wordId));
+}
+
+// Iterating the unigram entries must visit every stored word id with its stored values.
+TEST(LanguageModelDictContentTest, TestIterateProbabilityEntry) {
+    LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
+
+    const ProbabilityEntry originalEntry(0xFC, 100);
+
+    const int wordIds[] = { 1, 2, 3, 4, 5 };
+    for (const int wordId : wordIds) {
+        languageModelDictContent.setProbabilityEntry(wordId, &originalEntry);
+    }
+    // Ids still waiting to be seen; the set must be empty once iteration is over.
+    std::unordered_set<int> wordIdSet(std::begin(wordIds), std::end(wordIds));
+    for (const auto& entry : languageModelDictContent.getProbabilityEntries(WordIdArrayView())) {
+        EXPECT_EQ(originalEntry.getFlags(), entry.getProbabilityEntry().getFlags());
+        EXPECT_EQ(originalEntry.getProbability(), entry.getProbabilityEntry().getProbability());
+        wordIdSet.erase(entry.getWordId());
+    }
+    EXPECT_TRUE(wordIdSet.empty());
+}
+
+// With two previous words supplied, the lookup reports the bigram probability while only a
+// bigram entry exists, and the trigram probability once a trigram entry has been added.
+TEST(LanguageModelDictContentTest, TestGetWordProbability) {
+    LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
+
+    const int flag = 0xFF;
+    const int probability = 10;
+    const int bigramProbability = 20;
+    const int trigramProbability = 30;
+    const int wordId = 100;
+    const std::array<int, 2> prevWordIdArray = {{ 1, 2 }};
+    const WordIdArrayView prevWordIds = WordIdArrayView::fromArray(prevWordIdArray);
+
+    const ProbabilityEntry probabilityEntry(flag, probability);
+    languageModelDictContent.setProbabilityEntry(wordId, &probabilityEntry);
+    const ProbabilityEntry bigramProbabilityEntry(flag, bigramProbability);
+    languageModelDictContent.setProbabilityEntry(prevWordIds[0], &probabilityEntry);
+    languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1), wordId,
+            &bigramProbabilityEntry);
+    EXPECT_EQ(bigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId,
+            false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability());
+    const ProbabilityEntry trigramProbabilityEntry(flag, trigramProbability);
+    languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(1),
+            prevWordIds[1], &probabilityEntry);
+    languageModelDictContent.setNgramProbabilityEntry(prevWordIds.limit(2), wordId,
+            &trigramProbabilityEntry);
+    EXPECT_EQ(trigramProbability, languageModelDictContent.getWordAttributes(prevWordIds, wordId,
+            false /* mustMatchAllPrevWords */, nullptr /* headerPolicy */).getProbability());
+}
+
+}  // namespace
+}  // namespace latinime
diff --git a/app/src/main/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp b/app/src/main/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp
new file mode 100644
index 000000000..ba81671b5
--- /dev/null
+++ b/app/src/main/jni/tests/dictionary/structure/v4/content/probability_entry_test.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/probability_entry.h" + +#include + +#include "defines.h" + +namespace latinime { +namespace { + +TEST(ProbabilityEntryTest, TestEncodeDecode) { + const int flag = 0xFF; + const int probability = 10; + + const ProbabilityEntry entry(flag, probability); + const uint64_t encodedEntry = entry.encode(false /* hasHistoricalInfo */); + const ProbabilityEntry decodedEntry = + ProbabilityEntry::decode(encodedEntry, false /* hasHistoricalInfo */); + EXPECT_EQ(0xFF0Aull, encodedEntry); + EXPECT_EQ(flag, decodedEntry.getFlags()); + EXPECT_EQ(probability, decodedEntry.getProbability()); +} + +TEST(ProbabilityEntryTest, TestEncodeDecodeWithHistoricalInfo) { + const int flag = 0xF0; + const int timestamp = 0x3FFFFFFF; + const int count = 0xABCD; + + const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count); + const ProbabilityEntry entry(flag, &historicalInfo); + + const uint64_t encodedEntry = entry.encode(true /* hasHistoricalInfo */); + EXPECT_EQ(0xF03FFFFFFFABCDull, encodedEntry); + const ProbabilityEntry decodedEntry = + ProbabilityEntry::decode(encodedEntry, true /* hasHistoricalInfo */); + + EXPECT_EQ(flag, decodedEntry.getFlags()); + EXPECT_EQ(timestamp, decodedEntry.getHistoricalInfo()->getTimestamp()); + EXPECT_EQ(count, decodedEntry.getHistoricalInfo()->getCount()); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp 
b/app/src/main/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp new file mode 100644 index 000000000..4f23889ca --- /dev/null +++ b/app/src/main/jni/tests/dictionary/structure/v4/content/terminal_position_lookup_table_test.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" + +#include + +#include + +#include "defines.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { +namespace { + +TEST(TerminalPositionLookupTableTest, TestGetFromEmptyTable) { + TerminalPositionLookupTable lookupTable; + + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(0)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(-1)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition( + Ver4DictConstants::NOT_A_TERMINAL_ID)); +} + +TEST(TerminalPositionLookupTableTest, TestSetAndGet) { + TerminalPositionLookupTable lookupTable; + + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(10, 100)); + EXPECT_EQ(100, lookupTable.getTerminalPtNodePosition(10)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(9)); + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(9, 200)); + EXPECT_EQ(200, lookupTable.getTerminalPtNodePosition(9)); + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(10, 300)); + EXPECT_EQ(300, 
lookupTable.getTerminalPtNodePosition(10)); + EXPECT_FALSE(lookupTable.setTerminalPtNodePosition(-1, 400)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition(-1)); + EXPECT_FALSE(lookupTable.setTerminalPtNodePosition(Ver4DictConstants::NOT_A_TERMINAL_ID, 500)); + EXPECT_EQ(NOT_A_DICT_POS, lookupTable.getTerminalPtNodePosition( + Ver4DictConstants::NOT_A_TERMINAL_ID)); +} + +TEST(TerminalPositionLookupTableTest, TestGC) { + TerminalPositionLookupTable lookupTable; + + const std::vector terminalIds = { 10, 20, 30 }; + const std::vector terminalPositions = { 100, 200, 300 }; + + for (size_t i = 0; i < terminalIds.size(); ++i) { + EXPECT_TRUE(lookupTable.setTerminalPtNodePosition(terminalIds[i], terminalPositions[i])); + } + + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + EXPECT_TRUE(lookupTable.runGCTerminalIds(&terminalIdMap)); + + for (size_t i = 0; i < terminalIds.size(); ++i) { + EXPECT_EQ(static_cast(i), terminalIdMap[terminalIds[i]]) + << "Terminal id (" << terminalIds[i] << ") should be changed to " << i; + EXPECT_EQ(terminalPositions[i], lookupTable.getTerminalPtNodePosition(i)); + } +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/bloom_filter_test.cpp b/app/src/main/jni/tests/dictionary/utils/bloom_filter_test.cpp new file mode 100644 index 000000000..bcc88438c --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/bloom_filter_test.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/bloom_filter.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace latinime { +namespace { + +TEST(BloomFilterTest, TestFilter) { + static const int TEST_RANDOM_DATA_MAX = 65536; + static const int ELEMENT_COUNT = 1000; + std::vector elements; + + // Initialize data set with random integers. + { + // Use the uniform integer distribution [0, TEST_RANDOM_DATA_MAX]. + std::uniform_int_distribution distribution(0, TEST_RANDOM_DATA_MAX); + auto randomNumberGenerator = std::bind(distribution, std::mt19937()); + for (int i = 0; i < ELEMENT_COUNT; ++i) { + elements.push_back(randomNumberGenerator()); + } + } + + // Make sure BloomFilter contains nothing by default. + BloomFilter bloomFilter; + for (const int elem : elements) { + ASSERT_FALSE(bloomFilter.isInFilter(elem)); + } + + // Copy some of the test vector into bloom filter. + std::unordered_set elementsThatHaveBeenSetInFilter; + { + // Use the uniform integer distribution [0, 1]. 
+ std::uniform_int_distribution distribution(0, 1); + auto randomBitGenerator = std::bind(distribution, std::mt19937()); + for (const int elem : elements) { + if (randomBitGenerator() == 0) { + bloomFilter.setInFilter(elem); + elementsThatHaveBeenSetInFilter.insert(elem); + } + } + } + + for (const int elem : elements) { + const bool existsInFilter = bloomFilter.isInFilter(elem); + const bool hasBeenSetInFilter = + elementsThatHaveBeenSetInFilter.find(elem) != elementsThatHaveBeenSetInFilter.end(); + if (hasBeenSetInFilter) { + EXPECT_TRUE(existsInFilter) << "elem: " << elem; + } + if (!existsInFilter) { + EXPECT_FALSE(hasBeenSetInFilter) << "elem: " << elem; + } + } +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp b/app/src/main/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp new file mode 100644 index 000000000..25878910b --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/buffer_with_extendable_buffer_test.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +#include + +namespace latinime { +namespace { + +const int DEFAULT_MAX_BUFFER_SIZE = 1024; + +TEST(BufferWithExtendablebufferTest, TestWriteAndRead) { + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + int pos = 0; + // 1 byte + const uint32_t data_1 = 0xFF; + EXPECT_TRUE(buffer.writeUint(data_1, 1 /* size */, pos)); + EXPECT_EQ(data_1, buffer.readUint(1, pos)); + pos += 1; + // 2 byte + const uint32_t data_2 = 0xFFFF; + EXPECT_TRUE(buffer.writeUint(data_2, 2 /* size */, pos)); + EXPECT_EQ(data_2, buffer.readUint(2, pos)); + pos += 2; + // 3 byte + const uint32_t data_3 = 0xFFFFFF; + EXPECT_TRUE(buffer.writeUint(data_3, 3 /* size */, pos)); + EXPECT_EQ(data_3, buffer.readUint(3, pos)); + pos += 3; + // 4 byte + const uint32_t data_4 = 0xFFFFFFFF; + EXPECT_TRUE(buffer.writeUint(data_4, 4 /* size */, pos)); + EXPECT_EQ(data_4, buffer.readUint(4, pos)); +} + +TEST(BufferWithExtendablebufferTest, TestExtend) { + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_EQ(0, buffer.getTailPosition()); + EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); + EXPECT_EQ(4, buffer.getTailPosition()); + EXPECT_TRUE(buffer.extend(8 /* size */)); + EXPECT_EQ(12, buffer.getTailPosition()); + EXPECT_TRUE(buffer.writeUint(0xFFFF /* data */, 4 /* size */, 8 /* pos */)); + EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); +} + +TEST(BufferWithExtendablebufferTest, TestCopy) { + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_TRUE(buffer.writeUint(0xFF /* data */, 4 /* size */, 0 /* pos */)); + EXPECT_TRUE(buffer.writeUint(0xFFFF /* data */, 4 /* size */, 4 /* pos */)); + BufferWithExtendableBuffer targetBuffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_TRUE(targetBuffer.copy(&buffer)); + EXPECT_EQ(0xFFu, targetBuffer.readUint(4 /* size */, 0 /* pos */)); + EXPECT_EQ(0xFFFFu, targetBuffer.readUint(4 /* size */, 4 /* pos */)); +} 
+ +TEST(BufferWithExtendablebufferTest, TestSizeLimit) { + BufferWithExtendableBuffer emptyBuffer(0 /* maxAdditionalBufferSize */); + EXPECT_FALSE(emptyBuffer.writeUint(0 /* data */, 1 /* size */, 0 /* pos */)); + EXPECT_FALSE(emptyBuffer.extend(1 /* size */)); + + BufferWithExtendableBuffer smallBuffer(4 /* maxAdditionalBufferSize */); + EXPECT_TRUE(smallBuffer.writeUint(0 /* data */, 4 /* size */, 0 /* pos */)); + EXPECT_FALSE(smallBuffer.writeUint(0 /* data */, 1 /* size */, 4 /* pos */)); + + EXPECT_TRUE(smallBuffer.copy(&emptyBuffer)); + EXPECT_FALSE(emptyBuffer.copy(&smallBuffer)); + + BufferWithExtendableBuffer buffer(DEFAULT_MAX_BUFFER_SIZE); + EXPECT_FALSE(buffer.isNearSizeLimit()); + int pos = 0; + while (!buffer.isNearSizeLimit()) { + EXPECT_TRUE(buffer.writeUintAndAdvancePosition(0 /* data */, 4 /* size */, &pos)); + } + EXPECT_GT(pos, 0); + EXPECT_LE(pos, DEFAULT_MAX_BUFFER_SIZE); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/byte_array_utils_test.cpp b/app/src/main/jni/tests/dictionary/utils/byte_array_utils_test.cpp new file mode 100644 index 000000000..07257530b --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/byte_array_utils_test.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/byte_array_utils.h" + +#include + +#include + +namespace latinime { +namespace { + +TEST(ByteArrayUtilsTest, TestReadCodePointTable) { + const int codePointTable[] = { 0x6f, 0x6b }; + const uint8_t buffer[] = { 0x20u, 0x21u, 0x00u, 0x01u, 0x00u }; + int pos = 0; + // Expect the first entry of codePointTable + EXPECT_EQ(0x6f, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); + // Expect the second entry of codePointTable + EXPECT_EQ(0x6b, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); + // Expect the original code point from buffer[2] to buffer[4], 0x100 + // It isn't picked from the codePointTable, since it exceeds the range of the codePointTable. + EXPECT_EQ(0x100, ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, &pos)); +} + +TEST(ByteArrayUtilsTest, TestReadInt) { + const uint8_t buffer[] = { 0x1u, 0x8Au, 0x0u, 0xAAu }; + + EXPECT_EQ(0x01u, ByteArrayUtils::readUint8(buffer, 0)); + EXPECT_EQ(0x8Au, ByteArrayUtils::readUint8(buffer, 1)); + EXPECT_EQ(0x0u, ByteArrayUtils::readUint8(buffer, 2)); + EXPECT_EQ(0xAAu, ByteArrayUtils::readUint8(buffer, 3)); + + EXPECT_EQ(0x018Au, ByteArrayUtils::readUint16(buffer, 0)); + EXPECT_EQ(0x8A00u, ByteArrayUtils::readUint16(buffer, 1)); + EXPECT_EQ(0xAAu, ByteArrayUtils::readUint16(buffer, 2)); + + EXPECT_EQ(0x18A00AAu, ByteArrayUtils::readUint32(buffer, 0)); + + int pos = 0; + EXPECT_EQ(0x18A00, ByteArrayUtils::readSint24AndAdvancePosition(buffer, &pos)); + pos = 1; + EXPECT_EQ(-0xA00AA, ByteArrayUtils::readSint24AndAdvancePosition(buffer, &pos)); +} + +TEST(ByteArrayUtilsTest, TestWriteAndReadInt) { + uint8_t buffer[4]; + + int pos = 0; + const uint8_t data_1B = 0xC8; + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data_1B, 1, &pos); + EXPECT_EQ(data_1B, ByteArrayUtils::readUint(buffer, 1, 0)); + + pos = 0; + const uint32_t data_4B = 0xABCD1234; + ByteArrayUtils::writeUintAndAdvancePosition(buffer, 
data_4B, 4, &pos); + EXPECT_EQ(data_4B, ByteArrayUtils::readUint(buffer, 4, 0)); +} + +TEST(ByteArrayUtilsTest, TestReadCodePoint) { + const uint8_t buffer[] = { 0x10, 0xFF, 0x00u, 0x20u, 0x41u, 0x1Fu, 0x60 }; + + EXPECT_EQ(0x10FF00, ByteArrayUtils::readCodePoint(buffer, 0)); + EXPECT_EQ(0x20, ByteArrayUtils::readCodePoint(buffer, 3)); + EXPECT_EQ(0x41, ByteArrayUtils::readCodePoint(buffer, 4)); + EXPECT_EQ(NOT_A_CODE_POINT, ByteArrayUtils::readCodePoint(buffer, 5)); + + int pos = 0; + int codePointArray[3]; + EXPECT_EQ(3, ByteArrayUtils::readStringAndAdvancePosition(buffer, MAX_WORD_LENGTH, nullptr, + codePointArray, &pos)); + EXPECT_EQ(0x10FF00, codePointArray[0]); + EXPECT_EQ(0x20, codePointArray[1]); + EXPECT_EQ(0x41, codePointArray[2]); + EXPECT_EQ(0x60, ByteArrayUtils::readCodePoint(buffer, pos)); +} + +TEST(ByteArrayUtilsTest, TestWriteAndReadCodePoint) { + uint8_t buffer[10]; + + const int codePointArray[] = { 0x10FF00, 0x20, 0x41 }; + int pos = 0; + ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePointArray, 3, + true /* writesTerminator */, &pos); + EXPECT_EQ(0x10FF00, ByteArrayUtils::readCodePoint(buffer, 0)); + EXPECT_EQ(0x20, ByteArrayUtils::readCodePoint(buffer, 3)); + EXPECT_EQ(0x41, ByteArrayUtils::readCodePoint(buffer, 4)); + EXPECT_EQ(NOT_A_CODE_POINT, ByteArrayUtils::readCodePoint(buffer, 5)); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/format_utils_test.cpp b/app/src/main/jni/tests/dictionary/utils/format_utils_test.cpp new file mode 100644 index 000000000..3561bda30 --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/format_utils_test.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/format_utils.h" + +#include + +#include + +#include "utils/byte_array_view.h" + +namespace latinime { +namespace { + +TEST(FormatUtilsTest, TestMagicNumber) { + EXPECT_EQ(0x9BC13AFE, FormatUtils::MAGIC_NUMBER) << "Magic number must not be changed."; +} + +const std::vector getBuffer(const int magicNumber, const int version, const uint16_t flags, + const size_t headerSize) { + std::vector buffer; + buffer.push_back(magicNumber >> 24); + buffer.push_back(magicNumber >> 16); + buffer.push_back(magicNumber >> 8); + buffer.push_back(magicNumber); + + buffer.push_back(version >> 8); + buffer.push_back(version); + + buffer.push_back(flags >> 8); + buffer.push_back(flags); + + buffer.push_back(headerSize >> 24); + buffer.push_back(headerSize >> 16); + buffer.push_back(headerSize >> 8); + buffer.push_back(headerSize); + return buffer; +} + +TEST(FormatUtilsTest, TestDetectFormatVersion) { + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, + FormatUtils::detectFormatVersion(ReadOnlyByteArrayView())); + + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_2, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_2, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_402, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_402, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + 
getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_403, 0, 0); + EXPECT_EQ(FormatUtils::VERSION_403, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER - 1, FormatUtils::VERSION_2, 0, 0); + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, 100, 0, 0); + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size()))); + } + { + const std::vector buffer = + getBuffer(FormatUtils::MAGIC_NUMBER, FormatUtils::VERSION_2, 0, 0); + EXPECT_EQ(FormatUtils::UNKNOWN_VERSION, FormatUtils::detectFormatVersion( + ReadOnlyByteArrayView(buffer.data(), buffer.size() - 1))); + } +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/probability_utils_test.cpp b/app/src/main/jni/tests/dictionary/utils/probability_utils_test.cpp new file mode 100644 index 000000000..4020ea441 --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/probability_utils_test.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/probability_utils.h" + +#include + +#include "defines.h" + +namespace latinime { +namespace { + +TEST(ProbabilityUtilsTest, TestEncodeRawProbability) { + EXPECT_EQ(MAX_PROBABILITY, ProbabilityUtils::encodeRawProbability(1.0f)); + EXPECT_EQ(MAX_PROBABILITY - 9, ProbabilityUtils::encodeRawProbability(0.5f)); + EXPECT_EQ(0, ProbabilityUtils::encodeRawProbability(0.0f)); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/sparse_table_test.cpp b/app/src/main/jni/tests/dictionary/utils/sparse_table_test.cpp new file mode 100644 index 000000000..237c9631c --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/sparse_table_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/sparse_table.h" + +#include + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { +namespace { + +TEST(SparseTableTest, TestSetAndGet) { + static const int BLOCK_SIZE = 64; + static const int DATA_SIZE = 4; + BufferWithExtendableBuffer indexTableBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + BufferWithExtendableBuffer contentTableBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + SparseTable sparseTable(&indexTableBuffer, &contentTableBuffer, BLOCK_SIZE, DATA_SIZE); + + EXPECT_FALSE(sparseTable.contains(10)); + EXPECT_TRUE(sparseTable.set(10, 100u)); + EXPECT_EQ(100u, sparseTable.get(10)); + EXPECT_TRUE(sparseTable.contains(10)); + EXPECT_TRUE(sparseTable.contains(BLOCK_SIZE - 1)); + EXPECT_FALSE(sparseTable.contains(BLOCK_SIZE)); + EXPECT_TRUE(sparseTable.set(11, 101u)); + EXPECT_EQ(100u, sparseTable.get(10)); + EXPECT_EQ(101u, sparseTable.get(11)); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/dictionary/utils/trie_map_test.cpp b/app/src/main/jni/tests/dictionary/utils/trie_map_test.cpp new file mode 100644 index 000000000..8f3ec9d24 --- /dev/null +++ b/app/src/main/jni/tests/dictionary/utils/trie_map_test.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/utils/trie_map.h" + +#include + +#include +#include +#include +#include +#include +#include + +namespace latinime { +namespace { + +TEST(TrieMapTest, TestSetAndGet) { + TrieMap trieMap; + trieMap.putRoot(10, 10); + EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); + trieMap.putRoot(0x10A, 10); + EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); + EXPECT_EQ(10ull, trieMap.getRoot(0x10A).mValue); + trieMap.putRoot(10, 1000); + EXPECT_EQ(1000ull, trieMap.getRoot(10).mValue); + trieMap.putRoot(11, 1000); + EXPECT_EQ(1000ull, trieMap.getRoot(11).mValue); + const int next = trieMap.getNextLevelBitmapEntryIndex(10); + EXPECT_EQ(1000ull, trieMap.getRoot(10).mValue); + trieMap.put(9, 9, next); + EXPECT_EQ(9ull, trieMap.get(9, next).mValue); + EXPECT_FALSE(trieMap.get(11, next).mIsValid); + trieMap.putRoot(0, 0xFFFFFFFFFull); + EXPECT_EQ(0xFFFFFFFFFull, trieMap.getRoot(0).mValue); +} + +TEST(TrieMapTest, TestRemove) { + TrieMap trieMap; + trieMap.putRoot(10, 10); + EXPECT_EQ(10ull, trieMap.getRoot(10).mValue); + EXPECT_TRUE(trieMap.remove(10, trieMap.getRootBitmapEntryIndex())); + EXPECT_FALSE(trieMap.getRoot(10).mIsValid); + for (const auto &element : trieMap.getEntriesInRootLevel()) { + (void)element; // not used + EXPECT_TRUE(false); + } + EXPECT_TRUE(trieMap.putRoot(10, 0x3FFFFF)); + EXPECT_FALSE(trieMap.remove(11, trieMap.getRootBitmapEntryIndex())) + << "Should fail if the key does not exist."; + EXPECT_EQ(0x3FFFFFull, trieMap.getRoot(10).mValue); + trieMap.putRoot(12, 11); + const int nextLevel = trieMap.getNextLevelBitmapEntryIndex(10); + trieMap.put(10, 10, nextLevel); + EXPECT_EQ(0x3FFFFFull, trieMap.getRoot(10).mValue); + EXPECT_EQ(10ull, trieMap.get(10, nextLevel).mValue); + EXPECT_TRUE(trieMap.remove(10, trieMap.getRootBitmapEntryIndex())); + const TrieMap::Result result = trieMap.getRoot(10); + EXPECT_FALSE(result.mIsValid); + EXPECT_EQ(TrieMap::INVALID_INDEX, result.mNextLevelBitmapEntryIndex); + EXPECT_EQ(11ull, 
trieMap.getRoot(12).mValue); + EXPECT_TRUE(trieMap.putRoot(S_INT_MAX, 0xFFFFFFFFFull)); + EXPECT_TRUE(trieMap.remove(S_INT_MAX, trieMap.getRootBitmapEntryIndex())); +} + +TEST(TrieMapTest, TestSetAndGetLarge) { + static const int ELEMENT_COUNT = 200000; + TrieMap trieMap; + for (int i = 0; i < ELEMENT_COUNT; ++i) { + EXPECT_TRUE(trieMap.putRoot(i, i)); + } + for (int i = 0; i < ELEMENT_COUNT; ++i) { + EXPECT_EQ(static_cast(i), trieMap.getRoot(i).mValue); + } +} + +TEST(TrieMapTest, TestRandSetAndGetLarge) { + static const int ELEMENT_COUNT = 100000; + TrieMap trieMap; + std::unordered_map testKeyValuePairs; + + // Use the uniform integer distribution [S_INT_MIN, S_INT_MAX]. + std::uniform_int_distribution keyDistribution(S_INT_MIN, S_INT_MAX); + auto keyRandomNumberGenerator = std::bind(keyDistribution, std::mt19937()); + + // Use the uniform distribution [0, TrieMap::MAX_VALUE]. + std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); + auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); + + for (int i = 0; i < ELEMENT_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + EXPECT_TRUE(trieMap.putRoot(key, value)) << key << " " << value; + testKeyValuePairs[key] = value; + } + for (const auto &v : testKeyValuePairs) { + EXPECT_EQ(v.second, trieMap.getRoot(v.first).mValue); + } +} + +TEST(TrieMapTest, TestMultiLevel) { + static const int FIRST_LEVEL_ENTRY_COUNT = 10000; + static const int SECOND_LEVEL_ENTRY_COUNT = 20000; + static const int THIRD_LEVEL_ENTRY_COUNT = 40000; + + TrieMap trieMap; + std::vector firstLevelKeys; + std::map firstLevelEntries; + std::vector> secondLevelKeys; + std::map> twoLevelMap; + std::map>> threeLevelMap; + + // Use the uniform integer distribution [0, S_INT_MAX]. 
+ std::uniform_int_distribution distribution(0, S_INT_MAX); + auto keyRandomNumberGenerator = std::bind(distribution, std::mt19937()); + auto randomNumberGeneratorForKeySelection = std::bind(distribution, std::mt19937()); + + // Use the uniform distribution [0, TrieMap::MAX_VALUE]. + std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); + auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); + + for (int i = 0; i < FIRST_LEVEL_ENTRY_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + EXPECT_TRUE(trieMap.putRoot(key, value)); + firstLevelKeys.push_back(key); + firstLevelEntries[key] = value; + } + + for (int i = 0; i < SECOND_LEVEL_ENTRY_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + const int firstLevelKey = + firstLevelKeys[randomNumberGeneratorForKeySelection() % FIRST_LEVEL_ENTRY_COUNT]; + const int nextLevelBitmapEntryIndex = trieMap.getNextLevelBitmapEntryIndex(firstLevelKey); + EXPECT_NE(TrieMap::INVALID_INDEX, nextLevelBitmapEntryIndex); + EXPECT_TRUE(trieMap.put(key, value, nextLevelBitmapEntryIndex)); + secondLevelKeys.push_back(std::make_pair(firstLevelKey, key)); + twoLevelMap[firstLevelKey][key] = value; + } + + for (int i = 0; i < THIRD_LEVEL_ENTRY_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + const std::pair secondLevelKey = + secondLevelKeys[randomNumberGeneratorForKeySelection() % SECOND_LEVEL_ENTRY_COUNT]; + const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(secondLevelKey.first); + EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); + const int thirdLevel = trieMap.getNextLevelBitmapEntryIndex( + secondLevelKey.second, secondLevel); + EXPECT_NE(TrieMap::INVALID_INDEX, thirdLevel); + EXPECT_TRUE(trieMap.put(key, value, thirdLevel)); + threeLevelMap[secondLevelKey.first][secondLevelKey.second][key] 
= value; + } + + for (const auto &firstLevelEntry : firstLevelEntries) { + EXPECT_EQ(firstLevelEntry.second, trieMap.getRoot(firstLevelEntry.first).mValue); + } + + for (const auto &firstLevelEntry : twoLevelMap) { + const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(firstLevelEntry.first); + EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); + for (const auto &secondLevelEntry : firstLevelEntry.second) { + EXPECT_EQ(secondLevelEntry.second, + trieMap.get(secondLevelEntry.first, secondLevel).mValue); + } + } + + for (const auto &firstLevelEntry : threeLevelMap) { + const int secondLevel = trieMap.getNextLevelBitmapEntryIndex(firstLevelEntry.first); + EXPECT_NE(TrieMap::INVALID_INDEX, secondLevel); + for (const auto &secondLevelEntry : firstLevelEntry.second) { + const int thirdLevel = + trieMap.getNextLevelBitmapEntryIndex(secondLevelEntry.first, secondLevel); + EXPECT_NE(TrieMap::INVALID_INDEX, thirdLevel); + for (const auto &thirdLevelEntry : secondLevelEntry.second) { + EXPECT_EQ(thirdLevelEntry.second, + trieMap.get(thirdLevelEntry.first, thirdLevel).mValue); + } + } + } + + // Iteration + for (const auto &firstLevelEntry : trieMap.getEntriesInRootLevel()) { + EXPECT_EQ(trieMap.getRoot(firstLevelEntry.key()).mValue, firstLevelEntry.value()); + EXPECT_EQ(firstLevelEntries[firstLevelEntry.key()], firstLevelEntry.value()); + firstLevelEntries.erase(firstLevelEntry.key()); + for (const auto &secondLevelEntry : firstLevelEntry.getEntriesInNextLevel()) { + EXPECT_EQ(twoLevelMap[firstLevelEntry.key()][secondLevelEntry.key()], + secondLevelEntry.value()); + twoLevelMap[firstLevelEntry.key()].erase(secondLevelEntry.key()); + for (const auto &thirdLevelEntry : secondLevelEntry.getEntriesInNextLevel()) { + EXPECT_EQ(threeLevelMap[firstLevelEntry.key()][secondLevelEntry.key()] + [thirdLevelEntry.key()], thirdLevelEntry.value()); + threeLevelMap[firstLevelEntry.key()][secondLevelEntry.key()].erase( + thirdLevelEntry.key()); + } + } + } + + // Ensure all entries have 
been traversed. + EXPECT_TRUE(firstLevelEntries.empty()); + for (const auto &secondLevelEntry : twoLevelMap) { + EXPECT_TRUE(secondLevelEntry.second.empty()); + } + for (const auto &secondLevelEntry : threeLevelMap) { + for (const auto &thirdLevelEntry : secondLevelEntry.second) { + EXPECT_TRUE(thirdLevelEntry.second.empty()); + } + } +} + +TEST(TrieMapTest, TestIteration) { + static const int ELEMENT_COUNT = 200000; + TrieMap trieMap; + std::unordered_map testKeyValuePairs; + + // Use the uniform integer distribution [S_INT_MIN, S_INT_MAX]. + std::uniform_int_distribution keyDistribution(S_INT_MIN, S_INT_MAX); + auto keyRandomNumberGenerator = std::bind(keyDistribution, std::mt19937()); + + // Use the uniform distribution [0, TrieMap::MAX_VALUE]. + std::uniform_int_distribution valueDistribution(0, TrieMap::MAX_VALUE); + auto valueRandomNumberGenerator = std::bind(valueDistribution, std::mt19937()); + for (int i = 0; i < ELEMENT_COUNT; ++i) { + const int key = keyRandomNumberGenerator(); + const uint64_t value = valueRandomNumberGenerator(); + EXPECT_TRUE(trieMap.putRoot(key, value)); + testKeyValuePairs[key] = value; + } + for (const auto &entry : trieMap.getEntriesInRootLevel()) { + EXPECT_EQ(trieMap.getRoot(entry.key()).mValue, entry.value()); + EXPECT_EQ(testKeyValuePairs[entry.key()], entry.value()); + testKeyValuePairs.erase(entry.key()); + } + EXPECT_TRUE(testKeyValuePairs.empty()); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/suggest/core/dicnode/dic_node_pool_test.cpp b/app/src/main/jni/tests/suggest/core/dicnode/dic_node_pool_test.cpp new file mode 100644 index 000000000..854efdfe6 --- /dev/null +++ b/app/src/main/jni/tests/suggest/core/dicnode/dic_node_pool_test.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dicnode/dic_node_pool.h" + +#include <gtest/gtest.h> + +namespace latinime { +namespace { + +TEST(DicNodePoolTest, TestGet) { /* A pool built with CAPACITY hands out CAPACITY non-null instances, then nullptr. */ + static const int CAPACITY = 10; + DicNodePool dicNodePool(CAPACITY); + + for (int i = 0; i < CAPACITY; ++i) { + EXPECT_NE(nullptr, dicNodePool.getInstance()); + } + EXPECT_EQ(nullptr, dicNodePool.getInstance()); +} + +TEST(DicNodePoolTest, TestPlaceBack) { /* An instance placed back is handed out again; with capacity 1 it is the same pointer. */ + static const int CAPACITY = 1; + DicNodePool dicNodePool(CAPACITY); + + DicNode *const dicNode = dicNodePool.getInstance(); + EXPECT_NE(nullptr, dicNode); + EXPECT_EQ(nullptr, dicNodePool.getInstance()); + dicNodePool.placeBackInstance(dicNode); + EXPECT_EQ(dicNode, dicNodePool.getInstance()); +} + +TEST(DicNodePoolTest, TestReset) { /* After reset(n) exactly n instances are available, both when growing and when shrinking. */ + static const int CAPACITY_SMALL = 2; + static const int CAPACITY_LARGE = 10; + DicNodePool dicNodePool(CAPACITY_SMALL); + + for (int i = 0; i < CAPACITY_SMALL; ++i) { + EXPECT_NE(nullptr, dicNodePool.getInstance()); + } + EXPECT_EQ(nullptr, dicNodePool.getInstance()); + + dicNodePool.reset(CAPACITY_LARGE); + for (int i = 0; i < CAPACITY_LARGE; ++i) { + EXPECT_NE(nullptr, dicNodePool.getInstance()); + } + EXPECT_EQ(nullptr, dicNodePool.getInstance()); + + dicNodePool.reset(CAPACITY_SMALL); + for (int i = 0; i < CAPACITY_SMALL; ++i) { + EXPECT_NE(nullptr, dicNodePool.getInstance()); + } + EXPECT_EQ(nullptr, dicNodePool.getInstance()); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/suggest/core/layout/geometry_utils_test.cpp 
b/app/src/main/jni/tests/suggest/core/layout/geometry_utils_test.cpp new file mode 100644 index 000000000..f5f89ede1 --- /dev/null +++ b/app/src/main/jni/tests/suggest/core/layout/geometry_utils_test.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/layout/geometry_utils.h" + +#include <gtest/gtest.h> + +namespace latinime { +namespace { + +::testing::AssertionResult ExpectAngleDiffEq(const char* expectedExpression, + const char* actualExpression, float expected, float actual) { /* Predicate-formatter used through EXPECT_PRED_FORMAT2: fails when actual lies outside [0.0f, M_PI_F], otherwise compares expected and actual as floats. */ + if (actual < 0.0f || M_PI_F < actual) { + return ::testing::AssertionFailure() + << "Must be in the range of [0.0f, M_PI_F]." 
+ << " expected: " << expected + << " actual: " << actual; + } + return ::testing::internal::CmpHelperFloatingPointEQ( + expectedExpression, actualExpression, expected, actual); +} + +#define EXPECT_ANGLE_DIFF_EQ(expected, actual) \ + EXPECT_PRED_FORMAT2(ExpectAngleDiffEq, expected, actual); + +TEST(GeometryUtilsTest, testSquareFloat) { + const float test_data[] = { 0.0f, 1.0f, 123.456f, -1.0f, -9876.54321f }; + for (const float value : test_data) { + EXPECT_FLOAT_EQ(value * value, GeometryUtils::SQUARE_FLOAT(value)); + } +} + +TEST(GeometryUtilsTest, testGetAngle) { + EXPECT_FLOAT_EQ(0.0f, GeometryUtils::getAngle(0, 0, 0, 0)); + EXPECT_FLOAT_EQ(0.0f, GeometryUtils::getAngle(100, -10, 100, -10)); + + EXPECT_FLOAT_EQ(M_PI_F / 4.0f, GeometryUtils::getAngle(1, 1, 0, 0)); + EXPECT_FLOAT_EQ(M_PI_F, GeometryUtils::getAngle(-1, 0, 0, 0)); + + EXPECT_FLOAT_EQ(GeometryUtils::getAngle(0, 0, -1, 0), GeometryUtils::getAngle(1, 0, 0, 0)); + EXPECT_FLOAT_EQ(GeometryUtils::getAngle(1, 2, 3, 4), + GeometryUtils::getAngle(100, 200, 300, 400)); +} + +TEST(GeometryUtilsTest, testGetAngleDiff) { /* Every result is also range-checked to [0.0f, M_PI_F] by ExpectAngleDiffEq. */ + EXPECT_ANGLE_DIFF_EQ(0.0f, GeometryUtils::getAngleDiff(0.0f, 0.0f)); + EXPECT_ANGLE_DIFF_EQ(0.0f, GeometryUtils::getAngleDiff(10000.0f, 10000.0f)); + EXPECT_ANGLE_DIFF_EQ(ROUND_FLOAT_10000(M_PI_F), + GeometryUtils::getAngleDiff(0.0f, M_PI_F)); + EXPECT_ANGLE_DIFF_EQ(ROUND_FLOAT_10000(M_PI_F / 6.0f), + GeometryUtils::getAngleDiff(M_PI_F / 3.0f, M_PI_F / 2.0f)); + EXPECT_ANGLE_DIFF_EQ(ROUND_FLOAT_10000(M_PI_F / 2.0f), + GeometryUtils::getAngleDiff(0.0f, M_PI_F * 1.5f)); + EXPECT_ANGLE_DIFF_EQ(0.0f, GeometryUtils::getAngleDiff(0.0f, M_PI_F * 1024.0f)); + EXPECT_ANGLE_DIFF_EQ(0.0f, GeometryUtils::getAngleDiff(-M_PI_F, M_PI_F)); +} + +TEST(GeometryUtilsTest, testGetDistanceInt) { + EXPECT_EQ(0, GeometryUtils::getDistanceInt(0, 0, 0, 0)); + EXPECT_EQ(0, GeometryUtils::getDistanceInt(100, -10, 100, -10)); /* identical non-origin points: distance must be zero */ + + EXPECT_EQ(5, GeometryUtils::getDistanceInt(0, 0, 5, 0)); + EXPECT_EQ(5, 
GeometryUtils::getDistanceInt(0, 0, 3, 4)); + EXPECT_EQ(5, GeometryUtils::getDistanceInt(0, -4, 3, 0)); + EXPECT_EQ(5, GeometryUtils::getDistanceInt(0, 0, -3, -4)); + EXPECT_EQ(500, GeometryUtils::getDistanceInt(0, 0, 300, -400)); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/suggest/core/layout/normal_distribution_2d_test.cpp b/app/src/main/jni/tests/suggest/core/layout/normal_distribution_2d_test.cpp new file mode 100644 index 000000000..1d6a27c4f --- /dev/null +++ b/app/src/main/jni/tests/suggest/core/layout/normal_distribution_2d_test.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "suggest/core/layout/normal_distribution_2d.h" + +#include <gtest/gtest.h> + +#include <cmath> + +namespace latinime { +namespace { + +static const float ORIGIN_X = 0.0f; +static const float ORIGIN_Y = 0.0f; +static const float LARGE_STANDARD_DEVIATION = 100.0f; +static const float SMALL_STANDARD_DEVIATION = 10.0f; +static const float ZERO_RADIAN = 0.0f; + +TEST(NormalDistribution2DTest, ProbabilityDensity) { + const NormalDistribution2D distribution(ORIGIN_X, LARGE_STANDARD_DEVIATION, ORIGIN_Y, + SMALL_STANDARD_DEVIATION, ZERO_RADIAN); + + static const float SMALL_COORDINATE = 10.0f; + static const float LARGE_COORDINATE = 20.0f; + // The probability density of the point near the distribution center is larger than the + // probability density of the point that is far from distribution center. + EXPECT_GE(distribution.getProbabilityDensity(SMALL_COORDINATE, SMALL_COORDINATE), + distribution.getProbabilityDensity(LARGE_COORDINATE, LARGE_COORDINATE)); + // The probability density of the point shifted toward the direction that has larger standard + // deviation is larger than the probability density of the point shifted towards another + // direction. + EXPECT_GE(distribution.getProbabilityDensity(LARGE_COORDINATE, SMALL_COORDINATE), + distribution.getProbabilityDensity(SMALL_COORDINATE, LARGE_COORDINATE)); +} + +TEST(NormalDistribution2DTest, Rotate) { + static const float COORDINATES[] = {0.0f, 10.0f, 100.0f, -20.0f}; + static const float EPSILON = 0.01f; + const NormalDistribution2D distribution(ORIGIN_X, LARGE_STANDARD_DEVIATION, ORIGIN_Y, + SMALL_STANDARD_DEVIATION, ZERO_RADIAN); + const NormalDistribution2D rotatedDistribution(ORIGIN_X, LARGE_STANDARD_DEVIATION, ORIGIN_Y, + SMALL_STANDARD_DEVIATION, M_PI_4); + for (const float x : COORDINATES) { + for (const float y : COORDINATES) { + // The probability density of the rotated distribution at the point and the probability + // density of the original distribution at the rotated point are the same. 
+ const float probabilityDensity0 = distribution.getProbabilityDensity(x, y); + const float probabilityDensity1 = rotatedDistribution.getProbabilityDensity(-y, x); /* NOTE(review): (-y, x) is (x, y) turned by 90 degrees, while rotatedDistribution is built with M_PI_4 - confirm which angle is intended */ + EXPECT_NEAR(probabilityDensity0, probabilityDensity1, EPSILON); /* NOTE(review): EPSILON is an absolute tolerance; if getProbabilityDensity is the usual bivariate normal pdf, its peak for these deviations is roughly 1.6e-4, so this check may be unable to fail - TODO confirm */ + } + } +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp b/app/src/main/jni/tests/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp new file mode 100644 index 000000000..d13417964 --- /dev/null +++ b/app/src/main/jni/tests/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy_test.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h" + +#include <gtest/gtest.h> + +#include <vector> + +#include "suggest/policyimpl/utils/edit_distance.h" +#include "utils/int_array_view.h" + +namespace latinime { +namespace { + +TEST(DamerauLevenshteinEditDistancePolicyTest, TestConstructPolicy) { /* The policy reports the lengths of the two code point sequences it was built from. */ + const std::vector<int> codePoints0 = { 0x20, 0x40, 0x60 }; + const std::vector<int> codePoints1 = { 0x10, 0x20, 0x30, 0x40, 0x50, 0x60 }; + DamerauLevenshteinEditDistancePolicy policy(codePoints0.data(), codePoints0.size(), + codePoints1.data(), codePoints1.size()); + + EXPECT_EQ(static_cast<int>(codePoints0.size()), policy.getString0Length()); + EXPECT_EQ(static_cast<int>(codePoints1.size()), policy.getString1Length()); +} + +float getEditDistance(const std::vector<int> &codePoints0, const std::vector<int> &codePoints1) { /* Test helper: builds a policy over both sequences and returns EditDistance::getEditDistance for it. */ + DamerauLevenshteinEditDistancePolicy policy(codePoints0.data(), codePoints0.size(), + codePoints1.data(), codePoints1.size()); + return EditDistance::getEditDistance(&policy); +} + +TEST(DamerauLevenshteinEditDistancePolicyTest, TestEditDistance) { /* Groups below: equal inputs, one side empty, all substituted, insert/delete, adjacent transposition. */ + EXPECT_FLOAT_EQ(0.0f, getEditDistance({}, {})); + EXPECT_FLOAT_EQ(0.0f, getEditDistance({ 1 }, { 1 })); + EXPECT_FLOAT_EQ(0.0f, getEditDistance({ 1, 2, 3 }, { 1, 2, 3 })); + + EXPECT_FLOAT_EQ(1.0f, getEditDistance({ 1 }, { })); + EXPECT_FLOAT_EQ(1.0f, getEditDistance({}, { 100 })); + EXPECT_FLOAT_EQ(5.0f, getEditDistance({}, { 1, 2, 3, 4, 5 })); + + EXPECT_FLOAT_EQ(1.0f, getEditDistance({ 0 }, { 100 })); + EXPECT_FLOAT_EQ(5.0f, getEditDistance({ 1, 2, 3, 4, 5 }, { 11, 12, 13, 14, 15 })); + + EXPECT_FLOAT_EQ(1.0f, getEditDistance({ 1 }, { 1, 2 })); + EXPECT_FLOAT_EQ(2.0f, getEditDistance({ 1, 2 }, { 0, 1, 2, 3 })); + EXPECT_FLOAT_EQ(2.0f, getEditDistance({ 0, 1, 2, 3 }, { 1, 2 })); + + EXPECT_FLOAT_EQ(1.0f, getEditDistance({ 1, 2 }, { 2, 1 })); + EXPECT_FLOAT_EQ(2.0f, getEditDistance({ 1, 2, 3, 4 }, { 2, 1, 4, 3 })); +} +} // namespace +} // namespace latinime diff --git 
a/app/src/main/jni/tests/utils/autocorrection_threshold_utils_test.cpp b/app/src/main/jni/tests/utils/autocorrection_threshold_utils_test.cpp new file mode 100644 index 000000000..cc8db700f --- /dev/null +++ b/app/src/main/jni/tests/utils/autocorrection_threshold_utils_test.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/autocorrection_threshold_utils.h" + +#include <gtest/gtest.h> + +#include <vector> + +namespace latinime { +namespace { + +int CalcEditDistance(const std::vector<int> &before, + const std::vector<int> &after) { /* Test helper: forwards both sequences and their lengths to AutocorrectionThresholdUtils::editDistance. */ + return AutocorrectionThresholdUtils::editDistance( + before.data(), before.size(), after.data(), after.size()); /* data() instead of &v[0]: well defined even for an empty vector */ +} + +TEST(AutocorrectionThresholdUtilsTest, SameData) { /* Identical sequences of length 1 to 3 have edit distance 0. */ + EXPECT_EQ(0, CalcEditDistance({1}, {1})); + EXPECT_EQ(0, CalcEditDistance({2, 2}, {2, 2})); + EXPECT_EQ(0, CalcEditDistance({3, 3, 3}, {3, 3, 3})); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/utils/char_utils_test.cpp b/app/src/main/jni/tests/utils/char_utils_test.cpp new file mode 100644 index 000000000..01d534043 --- /dev/null +++ b/app/src/main/jni/tests/utils/char_utils_test.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/char_utils.h" + +#include <gtest/gtest.h> + +#include "defines.h" + +namespace latinime { +namespace { + +TEST(CharUtilsTest, TestIsAsciiUpper) { + EXPECT_TRUE(CharUtils::isAsciiUpper('A')); + EXPECT_TRUE(CharUtils::isAsciiUpper('Z')); + EXPECT_FALSE(CharUtils::isAsciiUpper('a')); + EXPECT_FALSE(CharUtils::isAsciiUpper('z')); + EXPECT_FALSE(CharUtils::isAsciiUpper('@')); + EXPECT_FALSE(CharUtils::isAsciiUpper(' ')); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x0410 /* CYRILLIC CAPITAL LETTER A */)); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x0430 /* CYRILLIC SMALL LETTER A */)); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x3042 /* HIRAGANA LETTER A */)); + EXPECT_FALSE(CharUtils::isAsciiUpper(0x1F36A /* COOKIE */)); +} + +TEST(CharUtilsTest, TestToLowerCase) { + EXPECT_EQ('a', CharUtils::toLowerCase('A')); + EXPECT_EQ('z', CharUtils::toLowerCase('Z')); + EXPECT_EQ('a', CharUtils::toLowerCase('a')); + EXPECT_EQ('z', CharUtils::toLowerCase('z')); + EXPECT_EQ('@', CharUtils::toLowerCase('@')); + EXPECT_EQ(' ', CharUtils::toLowerCase(' ')); + EXPECT_EQ(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, + CharUtils::toLowerCase(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); + EXPECT_EQ(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, + CharUtils::toLowerCase(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE 
*/)); + EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, + CharUtils::toLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); + EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, + CharUtils::toLowerCase(0x0410 /* CYRILLIC CAPITAL LETTER A */)); + EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, + CharUtils::toLowerCase(0x0430 /* CYRILLIC SMALL LETTER A */)); + EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, + CharUtils::toLowerCase(0x3042 /* HIRAGANA LETTER A */)); + EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toLowerCase(0x1F36A /* COOKIE */)); +} + +TEST(CharUtilsTest, TestToBaseLowerCase) { + EXPECT_EQ('a', CharUtils::toBaseLowerCase('A')); + EXPECT_EQ('z', CharUtils::toBaseLowerCase('Z')); + EXPECT_EQ('a', CharUtils::toBaseLowerCase('a')); + EXPECT_EQ('z', CharUtils::toBaseLowerCase('z')); + EXPECT_EQ('@', CharUtils::toBaseLowerCase('@')); + EXPECT_EQ(' ', CharUtils::toBaseLowerCase(' ')); + EXPECT_EQ('a', CharUtils::toBaseLowerCase(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); + EXPECT_EQ('a', CharUtils::toBaseLowerCase(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); + EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, + CharUtils::toBaseLowerCase(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); + EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, + CharUtils::toBaseLowerCase(0x0410 /* CYRILLIC CAPITAL LETTER A */)); + EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, + CharUtils::toBaseLowerCase(0x0430 /* CYRILLIC SMALL LETTER A */)); + EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, + CharUtils::toBaseLowerCase(0x3042 /* HIRAGANA LETTER A */)); + EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseLowerCase(0x1F36A /* COOKIE */)); +} + +TEST(CharUtilsTest, TestToBaseCodePoint) { + EXPECT_EQ('A', CharUtils::toBaseCodePoint('A')); + EXPECT_EQ('Z', CharUtils::toBaseCodePoint('Z')); + EXPECT_EQ('a', CharUtils::toBaseCodePoint('a')); + EXPECT_EQ('z', CharUtils::toBaseCodePoint('z')); + EXPECT_EQ('@', CharUtils::toBaseCodePoint('@')); + EXPECT_EQ(' ', 
CharUtils::toBaseCodePoint(' ')); + EXPECT_EQ('A', CharUtils::toBaseCodePoint(0x00C0 /* LATIN CAPITAL LETTER A WITH GRAVE */)); + EXPECT_EQ('a', CharUtils::toBaseCodePoint(0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */)); + EXPECT_EQ(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, + CharUtils::toBaseCodePoint(0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */)); /* this test covers toBaseCodePoint; expected value presumably unchanged, as for the other unaccented letters - verify on first run */ + EXPECT_EQ(0x0410 /* CYRILLIC CAPITAL LETTER A */, + CharUtils::toBaseCodePoint(0x0410 /* CYRILLIC CAPITAL LETTER A */)); + EXPECT_EQ(0x0430 /* CYRILLIC SMALL LETTER A */, + CharUtils::toBaseCodePoint(0x0430 /* CYRILLIC SMALL LETTER A */)); + EXPECT_EQ(0x3042 /* HIRAGANA LETTER A */, + CharUtils::toBaseCodePoint(0x3042 /* HIRAGANA LETTER A */)); + EXPECT_EQ(0x1F36A /* COOKIE */, CharUtils::toBaseCodePoint(0x1F36A /* COOKIE */)); +} + +TEST(CharUtilsTest, TestIsIntentionalOmissionCodePoint) { + EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint('\'')); + EXPECT_TRUE(CharUtils::isIntentionalOmissionCodePoint('-')); + EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('a')); + EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('?')); + EXPECT_FALSE(CharUtils::isIntentionalOmissionCodePoint('/')); +} + +TEST(CharUtilsTest, TestIsInUnicodeSpace) { + EXPECT_FALSE(CharUtils::isInUnicodeSpace(NOT_A_CODE_POINT)); + EXPECT_FALSE(CharUtils::isInUnicodeSpace(CODE_POINT_BEGINNING_OF_SENTENCE)); + EXPECT_TRUE(CharUtils::isInUnicodeSpace('a')); + EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x0410 /* CYRILLIC CAPITAL LETTER A */)); + EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x3042 /* HIRAGANA LETTER A */)); + EXPECT_TRUE(CharUtils::isInUnicodeSpace(0x1F36A /* COOKIE */)); +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/utils/int_array_view_test.cpp b/app/src/main/jni/tests/utils/int_array_view_test.cpp new file mode 100644 index 000000000..2fce633f5 --- /dev/null +++ b/app/src/main/jni/tests/utils/int_array_view_test.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2014 The Android Open Source 
Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/int_array_view.h" + +#include <gtest/gtest.h> + +#include <array> +#include <vector> + +namespace latinime { +namespace { + +TEST(IntArrayViewTest, TestAccess) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + EXPECT_EQ(intVector.size(), intArrayView.size()); + for (int i = 0; i < static_cast<int>(intVector.size()); ++i) { + EXPECT_EQ(intVector[i], intArrayView[i]); + } +} + +TEST(IntArrayViewTest, TestIteration) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + size_t expectedIndex = 0; + for (const int element : intArrayView) { + EXPECT_EQ(intVector[expectedIndex], element); + ++expectedIndex; + } + EXPECT_EQ(expectedIndex, intArrayView.size()); +} + +TEST(IntArrayViewTest, TestConstructFromArray) { + const size_t ARRAY_SIZE = 100; + std::array<int, ARRAY_SIZE> intArray; /* contents are never read; only the size is checked */ + const auto intArrayView = IntArrayView::fromArray(intArray); + EXPECT_EQ(ARRAY_SIZE, intArrayView.size()); +} + +TEST(IntArrayViewTest, TestConstructFromObject) { + const int object = 10; + const auto intArrayView = IntArrayView::singleElementView(&object); + EXPECT_EQ(1u, intArrayView.size()); + EXPECT_EQ(object, intArrayView[0]); +} + +TEST(IntArrayViewTest, TestContains) { + EXPECT_FALSE(IntArrayView().contains(0)); + EXPECT_FALSE(IntArrayView().contains(1)); + + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + 
EXPECT_TRUE(intArrayView.contains(0)); + EXPECT_TRUE(intArrayView.contains(3)); + EXPECT_TRUE(intArrayView.contains(-2)); + EXPECT_FALSE(intArrayView.contains(-3)); + EXPECT_FALSE(intArrayView.limit(0).contains(3)); +} + +TEST(IntArrayViewTest, TestLimit) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + + EXPECT_TRUE(intArrayView.limit(0).empty()); + EXPECT_EQ(intArrayView.size(), intArrayView.limit(intArrayView.size()).size()); + EXPECT_EQ(intArrayView.size(), intArrayView.limit(1000).size()); + + IntArrayView subView = intArrayView.limit(4); + EXPECT_EQ(4u, subView.size()); + for (size_t i = 0; i < subView.size(); ++i) { + EXPECT_EQ(intVector[i], subView[i]); + } +} + +TEST(IntArrayViewTest, TestSkip) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + + EXPECT_TRUE(intArrayView.skip(intVector.size()).empty()); + EXPECT_TRUE(intArrayView.skip(intVector.size() + 1).empty()); + EXPECT_EQ(intArrayView.size(), intArrayView.skip(0).size()); + EXPECT_EQ(intArrayView.size(), intArrayView.limit(1000).size()); /* NOTE(review): repeats a TestLimit assertion and does not exercise skip() */ + + static const size_t SKIP_COUNT = 2; + IntArrayView subView = intArrayView.skip(SKIP_COUNT); + EXPECT_EQ(intVector.size() - SKIP_COUNT, subView.size()); + for (size_t i = 0; i < subView.size(); ++i) { + EXPECT_EQ(intVector[i + SKIP_COUNT], subView[i]); + } +} + +TEST(IntArrayViewTest, TestCopyToArray) { + // "{{" to suppress warning. 
+ std::array<int, 7> buffer = {{10, 20, 30, 40, 50, 60, 70}}; + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + intArrayView.limit(0).copyToArray(&buffer, 0); + EXPECT_EQ(10, buffer[0]); + EXPECT_EQ(20, buffer[1]); + intArrayView.limit(1).copyToArray(&buffer, 0); + EXPECT_EQ(intVector[0], buffer[0]); + EXPECT_EQ(20, buffer[1]); + intArrayView.limit(1).copyToArray(&buffer, 1); + EXPECT_EQ(intVector[0], buffer[0]); + EXPECT_EQ(intVector[0], buffer[1]); + intArrayView.copyToArray(&buffer, 0); + for (size_t i = 0; i < intArrayView.size(); ++i) { + EXPECT_EQ(intVector[i], buffer[i]); + } + EXPECT_EQ(70, buffer[6]); /* the slot past the six copied elements keeps its initial value */ +} + +TEST(IntArrayViewTest, TestFirstOrDefault) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + + EXPECT_EQ(3, intArrayView.firstOrDefault(10)); + EXPECT_EQ(10, intArrayView.limit(0).firstOrDefault(10)); + EXPECT_EQ(-10, intArrayView.limit(0).firstOrDefault(-10)); + EXPECT_EQ(10, intArrayView.skip(6).firstOrDefault(10)); +} + +TEST(IntArrayViewTest, TestLastOrDefault) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + + EXPECT_EQ(-2, intArrayView.lastOrDefault(10)); + EXPECT_EQ(10, intArrayView.limit(0).lastOrDefault(10)); + EXPECT_EQ(-10, intArrayView.limit(0).lastOrDefault(-10)); + EXPECT_EQ(10, intArrayView.skip(6).lastOrDefault(10)); +} + +TEST(IntArrayViewTest, TestToVector) { + const std::vector<int> intVector = {3, 2, 1, 0, -1, -2}; + IntArrayView intArrayView(intVector); + EXPECT_EQ(intVector, intArrayView.toVector()); + EXPECT_EQ(std::vector<int>(), CodePointArrayView().toVector()); +} + +TEST(IntArrayViewTest, TestSplit) { /* Per the calls below, split takes a separator value and an optional cap on the number of pieces. */ + EXPECT_TRUE(IntArrayView().split(0, 0).empty()); + { + const auto intArrayViews = IntArrayView().split(0, 1); + EXPECT_EQ(1u, intArrayViews.size()); + EXPECT_TRUE(intArrayViews[0].empty()); + } + { + const auto intArrayViews = IntArrayView().split(0, 100); + EXPECT_EQ(1u, intArrayViews.size()); + 
EXPECT_TRUE(intArrayViews[0].empty()); + } + + const std::vector<int> intVector = {1, 2, 3, 3, 2, 3}; + const IntArrayView intArrayView(intVector); + { + const auto intArrayViews = intArrayView.split(2); + EXPECT_EQ(3u, intArrayViews.size()); + EXPECT_EQ(std::vector<int>({1}), intArrayViews[0].toVector()); + EXPECT_EQ(std::vector<int>({3, 3}), intArrayViews[1].toVector()); + EXPECT_EQ(std::vector<int>({3}), intArrayViews[2].toVector()); + } + { + const auto intArrayViews = intArrayView.split(2, 2); + EXPECT_EQ(2u, intArrayViews.size()); + EXPECT_EQ(std::vector<int>({1}), intArrayViews[0].toVector()); + EXPECT_EQ(std::vector<int>({3, 3, 2, 3}), intArrayViews[1].toVector()); + } + { + const auto intArrayViews = intArrayView.split(2, 1); + EXPECT_EQ(1u, intArrayViews.size()); + EXPECT_EQ(intVector, intArrayViews[0].toVector()); + } + { + const auto intArrayViews = intArrayView.split(2, 0); + EXPECT_EQ(0u, intArrayViews.size()); + } + { + const auto intArrayViews = intArrayView.split(3); + EXPECT_EQ(4u, intArrayViews.size()); + EXPECT_EQ(std::vector<int>({1, 2}), intArrayViews[0].toVector()); + EXPECT_EQ(std::vector<int>(), intArrayViews[1].toVector()); + EXPECT_EQ(std::vector<int>({2}), intArrayViews[2].toVector()); + EXPECT_EQ(std::vector<int>(), intArrayViews[3].toVector()); + } +} + +} // namespace +} // namespace latinime diff --git a/app/src/main/jni/tests/utils/time_keeper_test.cpp b/app/src/main/jni/tests/utils/time_keeper_test.cpp new file mode 100644 index 000000000..3f54b91f1 --- /dev/null +++ b/app/src/main/jni/tests/utils/time_keeper_test.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/time_keeper.h" + +#include <gtest/gtest.h> + +namespace latinime { +namespace { + +TEST(TimeKeeperTest, TestTestMode) { /* In test mode the forced time is reported even after setCurrentTime(); after stopTestMode() the reported time is not earlier than the time captured at the start. */ + TimeKeeper::setCurrentTime(); + const int startTime = TimeKeeper::peekCurrentTime(); + static const int TEST_CURRENT_TIME = 100; + TimeKeeper::startTestModeWithForceCurrentTime(TEST_CURRENT_TIME); + EXPECT_EQ(TEST_CURRENT_TIME, TimeKeeper::peekCurrentTime()); + TimeKeeper::setCurrentTime(); + EXPECT_EQ(TEST_CURRENT_TIME, TimeKeeper::peekCurrentTime()); + TimeKeeper::stopTestMode(); + TimeKeeper::setCurrentTime(); + EXPECT_LE(startTime, TimeKeeper::peekCurrentTime()); +} + +} // namespace +} // namespace latinime diff --git a/detekt.yml b/detekt.yml index 1b22e9337..1e3d943e1 100644 --- a/detekt.yml +++ b/detekt.yml @@ -1,6 +1,12 @@ build: excludes: - "**/be/scri/views/MyRecyclerView.kt" + - "**/be/scri/helpers/data/AutoSuggestionDataManager.kt" + - "**/be/scri/helpers/SuggestionHandler.kt" + - "**/be/scri/helpers/NativeSuggestionEngine.kt" + - "**/be/scri/latin/**" + - "**/be/scri/latin/**/*.kt" + - "**/latin/**" style: MagicNumber: