假设我们有100万行像这样:
import sqlite3 db = sqlite3.connect(':memory:') c = db.cursor() c.execute('CREATE TABLE mytable (id integer, description text)') c.execute('INSERT INTO mytable VALUES (1, "Riemann")') c.execute('INSERT INTO mytable VALUES (2, "All the Carmichael numbers")')
我知道如何用Sqlite做到这一点:
找到一个单字查询的行,最多有一些拼写错误与spellfix
模块和Levenshtein距离(我在这里发布了一个详细的答案,关于如何编译它,如何使用它,...):
db.enable_load_extension(True) db.load_extension('./spellfix') c.execute('SELECT * FROM mytable WHERE editdist3(description, "Riehmand") <300'); print c.fetchall() #Query: 'Riehmand' #Answer: [(1, u'Riemann')]
使用1M行,这将是超级慢!如此处详述,postgresql
可能会对此进行优化trigrams
.Sqlite提供的快速解决方案是使用VIRTUAL TABLE USING spellfix
:
c.execute('CREATE VIRTUAL TABLE mytable3 USING spellfix1') c.execute('INSERT INTO mytable3(word) VALUES ("Riemann")') c.execute('SELECT * FROM mytable3 WHERE word MATCH "Riehmand"'); print c.fetchall() #Query: 'Riehmand' #Answer: [(u'Riemann', 1, 76, 0, 107, 7)], working!
查找带有与FTS 匹配的一个或多个单词的查询的表达式("全文搜索"):
c.execute('CREATE VIRTUAL TABLE mytable2 USING fts4(id integer, description text)') c.execute('INSERT INTO mytable2 VALUES (2, "All the Carmichael numbers")') c.execute('SELECT * FROM mytable2 WHERE description MATCH "NUMBERS carmichael"'); print c.fetchall() #Query: 'NUMBERS carmichael' #Answer: [(2, u'All the Carmichael numbers')]
它不区分大小写,你甚至可以使用错误顺序的两个单词的查询等:FTS确实非常强大.但缺点是每个查询关键字必须正确拼写,即单独的FTS不允许拼写错误.
如何使用Sqlite进行全文搜索(FTS)并且还允许拼写错误?即"FTS + spellfix"在一起
例:
DB中的行: "All the Carmichael numbers"
查询:"NUMMBER carmickaeel"
应该匹配!
如何用Sqlite做到这一点?
由于此页面指出了以下内容,因此可能使用Sqlite :
或者,它[spellfix]可以与FTS4一起使用,使用可能拼写错误的单词进行全文搜索.
链接问题:与Python + Sqlite的字符串相似性(Levenshtein距离/编辑距离)
该spellfix1
文件实际上告诉你如何做到这一点.从概述部分:
如果您打算与FTS4表一起使用此虚拟表(用于搜索术语的拼写更正),那么您可以使用fts4aux表提取词汇表:
INSERT INTO demo(word) SELECT term FROM search_aux WHERE col='*';
该SELECT term from search_aux WHERE col='*'
语句提取所有索引标记.
将此连接到您的示例,mytable2
您的fts4虚拟表在哪里,您可以创建一个fts4aux
表并将这些令牌插入到mytable3
spellfix1表中,其中包括:
CREATE VIRTUAL TABLE mytable2_terms USING fts4aux(mytable2); INSERT INTO mytable3(word) SELECT term FROM mytable2_terms WHERE col='*';
您可能希望进一步限定该查询以跳过已插入spellfix1的任何术语,否则您最终会获得双重条目:
INSERT INTO mytable3(word) SELECT term FROM mytable2_terms WHERE col='*' AND term not in (SELECT word from mytable3_vocab);
现在,您可以使用mytable3
将拼写错误的单词映射到更正的标记,然后在MATCH
查询中再次使用这些更正的标记mytable2
.
根据您的neads,这可能意味着您需要自己进行令牌处理和查询构建; 没有暴露的fts4查询语法解析器.因此,您需要拆分双令牌搜索字符串,每个令牌都通过spellfix1
表格运行以映射到现有令牌,然后将这些令牌传递给fts4查询.
忽略SQL语法来处理这个问题,使用Python进行拆分很容易:
def spellcheck_terms(conn, terms): cursor = conn.cursor() base_spellfix = """ SELECT :term{0} as term, word FROM spellfix1data WHERE word MATCH :term{0} and top=1 """ terms = terms.split() params = {"term{}".format(i): t for i, t in enumerate(terms, 1)} query = " UNION ".join([ base_spellfix.format(i + 1) for i in range(len(params))]) cursor.execute(query, params) correction_map = dict(cursor) return " ".join([correction_map.get(t, t) for t in terms]) def spellchecked_search(conn, terms): corrected_terms = spellcheck_terms(conn, terms) cursor = conn.cursor() fts_query = 'SELECT * FROM mytable2 WHERE mytable2 MATCH ?' cursor.execute(fts_query, (corrected_terms,)) return cursor.fetchall()
这然后返回[('All the Carmichael numbers',)]
了spellchecked_search(db, "NUMMBER carmickaeel")
.
保持Python中的拼写检查处理允许您根据需要支持更复杂的FTS查询; 您可能必须重新实现表达式解析器才能这样做,但至少Python为您提供了执行此操作的工具.
一个完整的例子,在一个类中打包上述方法,它简单地将术语提取为字母数字字符序列(通过我阅读表达式语法规范,就足够了):
import re import sqlite3 import sys class FTS4SpellfixSearch(object): def __init__(self, conn, spellfix1_path): self.cOnn= conn self.conn.enable_load_extension(True) self.conn.load_extension(spellfix1_path) def create_schema(self): self.conn.executescript( """ CREATE VIRTUAL TABLE IF NOT EXISTS fts4data USING fts4(description text); CREATE VIRTUAL TABLE IF NOT EXISTS fts4data_terms USING fts4aux(fts4data); CREATE VIRTUAL TABLE IF NOT EXISTS spellfix1data USING spellfix1; """ ) def index_text(self, *text): cursor = self.conn.cursor() with self.conn: params = ((t,) for t in text) cursor.executemany("INSERT INTO fts4data VALUES (?)", params) cursor.execute( """ INSERT INTO spellfix1data(word) SELECT term FROM fts4data_terms WHERE col='*' AND term not in (SELECT word from spellfix1data_vocab) """ ) # fts3 / 4 search expression tokenizer # no attempt is made to validate the expression, only # to identify valid search terms and extract them. # the fts3/4 tokenizer considers any alphanumeric ASCII character # and character in the range U+0080 and over to be terms. if sys.maxunicode == 0xFFFF: # UCS2 build, keep it simple, match any UTF-16 codepoint 0080 and over _fts4_expr_terms = re.compile(u"[a-zA-Z0-9\u0080-\uffff]+") else: # UCS4 _fts4_expr_terms = re.compile(u"[a-zA-Z0-9\u0080-\U0010FFFF]+") def _terms_from_query(self, search_query): """Extract search terms from a fts3/4 query Returns a list of terms and a template such that template.format(*terms) reconstructs the original query. terms using partial* syntax are ignored, as you can't distinguish between a misspelled prefix search that happens to match existing tokens and a valid spelling that happens to have 'near' tokens in the spellfix1 database that would not otherwise be matched by fts4 """ template, terms, lastpos = [], [], 0 for match in self._fts4_expr_terms.finditer(search_query): token, (start, end) = match.group(), match.span() # skip columnname: and partial* terms by checking next character ismeta = search_query[end:end + 1] in {":", "*"} # skip digits if preceded by "NEAR/" ismeta = ismeta or ( token.isdigit() and template and template[-1] == "NEAR" and "/" in search_query[lastpos:start]) if token not in {"AND", "OR", "NOT", "NEAR"} and not ismeta: # full search term, not a keyword, column name or partial* terms.append(token) token = "{}" template += search_query[lastpos:start], token lastpos = end template.append(search_query[lastpos:]) return terms, "".join(template) def spellcheck_terms(self, search_query): cursor = self.conn.cursor() base_spellfix = """ SELECT :term{0} as term, word FROM spellfix1data WHERE word MATCH :term{0} and top=1 """ terms, template = self._terms_from_query(search_query) params = {"term{}".format(i): t for i, t in enumerate(terms, 1)} query = " UNION ".join( [base_spellfix.format(i + 1) for i in range(len(params))] ) cursor.execute(query, params) correction_map = dict(cursor) return template.format(*(correction_map.get(t, t) for t in terms)) def search(self, search_query): corrected_query = self.spellcheck_terms(search_query) cursor = self.conn.cursor() fts_query = "SELECT * FROM fts4data WHERE fts4data MATCH ?" cursor.execute(fts_query, (corrected_query,)) return { "terms": search_query, "corrected": corrected_query, "results": cursor.fetchall(), }
和使用该类的交互式演示:
>>> db = sqlite3.connect(":memory:") >>> fts = FTS4SpellfixSearch(db, './spellfix') >>> fts.create_schema() >>> fts.index_text("All the Carmichael numbers") # your example >>> from pprint import pprint >>> pprint(fts.search('NUMMBER carmickaeel')) {'corrected': 'numbers carmichael', 'results': [('All the Carmichael numbers',)], 'terms': 'NUMMBER carmickaeel'} >>> fts.index_text( ... "They are great", ... "Here some other numbers", ... ) >>> pprint(fts.search('here some')) # edgecase, multiple spellfix matches {'corrected': 'here some', 'results': [('Here some other numbers',)], 'terms': 'here some'} >>> pprint(fts.search('NUMMBER NOT carmickaeel')) # using fts4 query syntax {'corrected': 'numbers NOT carmichael', 'results': [('Here some other numbers',)], 'terms': 'NUMMBER NOT carmickaeel'}