Я сравнил несколько вариантов: Contains, SqliteFTS, Words. И выбрал реализацию Words.
Псевдо-C#:
var strings = new string[]; //25 млн записей
var words = new Dictionary<string,HashSet<int>>();
//формирование "справочника"
var str = string.Empty;
for(var stringId = strings.Length - 1; stringId >= 0; stringId--)
{
str = strings[stringId];
var stringWords = NormalizeString(str).Split(' ');
foreach(var stringWord in stringWords )
{
words[stringWord].Add(stringId);
}
}
//поиск
var searchTermWords= NormilizeString(searchTerm).Split(' ')
var foundIds = new HashSet<int>();
foreach(var searchTermWord in searchTermWords)
{
foreach(var matchWord in words.Keys.Where(x => x.Contains(searcgTermWord)))
{
if (words.TryGetValue(matchWord, out var stringIds))
{
if (foundIds == null)
{
foundIds = stringIds;
}
else
{
foundIds = stringIds.Where(x => foundIds .Contains(x)).ToHashSet();
}
}
else
{
foundIds = null;
}
}
}
Console.WriteLine($"Найдено строк: {foundIds.Count} ");
Тесты разных вариантов для списка объектов с 4мя строковыми полями:
Поиск: 100 циклов поиска 1-3 символьной подстроки по одному полю
records: ~5 000 000
TestContains (ms):
-> Max: 434, Avg: 295.56, Median: 281
TestSqliteFTS (ms):
CREATE -> 111
INSERT DATA -> 34697 //INSERT INTO temp_table(object_id, поле0, поле1, поле2, поле3)
INSERT INDEX -> 161683 // INSERT INTO fts_index(object_id, поле0, поле1, поле2, поле3 ) SELECT * FROM temp_table
DROP DATA -> 1191
VACUUM -> 15849
-> sqlite.db (FTS5: 1.6GB, tokenize = 'trigram', content='',columnsize=0, detail='column')
-> Max: 10, Avg: 1.16, Median: 0
TestWords (ms):
CREATE -> 89
INSERT DATA -> 98851 //INSERT INTO temp_table(word_id, object_id)
AGGREGATE DATA -> 28504 //SELET object_id FROM temp_table WHERE @word_id -> FastPFor -> INSERT word_id, object_ids_as_bytes
DROP DATA -> 1360
CREATE INDICES-> 9
VACUUM -> 262
-> sqlite.db (CUSTOM: 9.5MB, (tbl: fields -> id, value), (tbl: words -> id, field_id, value), (tbl: data -> word_id INTEGER, integersQty INTEGER, bytes BLOB))
-> Max: 128, Avg: 18.78, Median: 1
Статы:
field_id wordsQty maxRefsQty avgRefsQty maxRefsBytes avgRefsBytes
0 24075 6461929 271 910000 52
1 5339 23858735 4515 3336816 667
2 3602 6766040 1913 952808 295
3 11825 7595099 744 1069508 123
records: ~25 000 000
TestContains (ms):
-> Max: 2568, Avg: 1524.47, Median: 1437.5
TestSqliteFTS (ms):
CREATE -> 135
INSERT DATA -> 255882 //INSERT INTO temp_table(object_id, поле0, поле1, поле2, поле3)
INSERT INDEX -> 1022499 // INSERT INTO fts_index(object_id, поле0, поле1, поле2, поле3 ) SELECT * FROM temp_table
DROP DATA -> 370118
VACUUM -> 1230845
-> sqlite.db (FTS5: 8.1GB, tokenize = 'trigram', content='',columnsize=0, detail='column')
-> Max: 587, Avg: 11.53, Median: 0
TestWords (ms):
CREATE -> 107
INSERT DATA -> 581050 //INSERT INTO temp_table(word_id, object_id)
AGGREGATE DATA -> 132700 //SELET object_id FROM temp_table WHERE @word_id -> FastPFor -> INSERT word_id, object_ids_as_bytes
DROP DATA -> 6855
CREATE INDICES-> 32
VACUUM -> 1161
-> sqlite.db (CUSTOM: 35MB, (tbl: fields -> id, value), (tbl: words -> id, field_id, value), (tbl: data -> word_id INTEGER, integersQty INTEGER, bytes BLOB))
-> Max: 492, Avg: 64,87, Median: 1
Статы:
field_id wordsQty maxRefsQty avgRefsQty maxRefsBytes avgRefsBytes
0 24075 32570729 1355 4586324 205
1 5339 120257135 22577 16818780 3192
2 3602 34103240 9566 4802092 1372
3 11825 38282299 3723 5390372 542
P.S. После тестирования FastPFor, WAH, RoamingBitmap и LZO для хранения индексов (слово -> индекс строки[]) остановился на Delta + LZO. Итоговый размер индекса: 17MB. Максимальное время поиска 600ms, среднее 7ms.