Due to work, I have been busy lately with a number of company projects (most of them Spring Cloud based microservice projects), so it has been a while since I last shared a technical write-up. In fact, I have been steadily digging into the internals of Spring, Spring Boot and Spring Cloud, while also keeping an eye on how .NET Core is evolving and what is new there. I subscribe to several columns on GeekTime and read them whenever I have time after work, and I have bought a number of paper books as well. Over the past year I have mostly been absorbing and borrowing from others through WeChat tech accounts (.NET, Java, algorithms, front-end and so on), GeekTime and books, and using that to keep improving my own skills. As the saying goes, learning is like rowing upstream: if you stop, you fall behind. I learn while working and apply what I learn back at work; writing articles to share is a way of summing up, and also the best way to "review the old and learn the new".
Enough preamble; on to the topic of this post: a general-purpose search utility class built on Lucene.Net, SearchEngineUtil. For what Lucene is, see its Baidu Baike entry; the key point is that Lucene is a full-text search engine architecture that provides a complete query engine and indexing engine, and Lucene.Net is the implementation of it for C# and the .NET runtime (see the official Lucene.Net site). I won't go over the basic usage here, since the official docs and many online articles already cover it. However, the native Lucene.Net SDK's API is fairly complex and awkward to use directly, so I wrapped the common operations (add, delete, update and search, including paged search) in a way that keeps the flexibility while making Lucene.Net noticeably simpler to work with. The code itself is not complicated; the complete SearchEngineUtil is listed below:
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NLog;
using PanGu;
using PanGu.HighLight;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;

namespace Cn.Zuowenjun.Blog.Common
{
    /// <summary>
    /// Lucene search engine utility class
    /// Author: zuowenjun
    /// </summary>
    public class SearchEngineUtil
    {
        /// <summary>
        /// Create and add an index record
        /// </summary>
        /// <typeparam name="TIndex"></typeparam>
        /// <param name="indexDir"></param>
        /// <param name="indexData"></param>
        /// <param name="setDocFiledsAction"></param>
        public static void AddIndex<TIndex>(string indexDir, TIndex indexData, Action<Document, TIndex> setDocFiledsAction)
        {
            // create the index directory if it does not exist yet
            if (!System.IO.Directory.Exists(indexDir))
            {
                System.IO.Directory.CreateDirectory(indexDir);
            }
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            bool isUpdate = IndexReader.IndexExists(directory);
            if (isUpdate)
            {
                // if the index directory is locked (e.g. the process exited abnormally while indexing), unlock it first
                if (IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }
            }
            using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                Document document = new Document();
                setDocFiledsAction(document, indexData);
                writer.AddDocument(document);
                writer.Optimize(); // optimize the index
            }
        }

        /// <summary>
        /// Delete an index record
        /// </summary>
        /// <param name="indexDir"></param>
        /// <param name="keyFiledName"></param>
        /// <param name="keyFileValue"></param>
        public static void DeleteIndex(string indexDir, string keyFiledName, object keyFileValue)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return;
            }
            using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                iw.DeleteDocuments(new Term(keyFiledName, keyFileValue.ToString()));
                // deleted documents are not removed from disk right away; a .del file is produced instead.
                // Optimize purges them; before purging, UndeleteAll can still restore them.
                iw.Optimize();
            }
        }

        /// <summary>
        /// Update an index record
        /// </summary>
        /// <param name="indexDir"></param>
        /// <param name="keyFiledName"></param>
        /// <param name="keyFileValue"></param>
        /// <param name="doc"></param>
        public static void UpdateIndex(string indexDir, string keyFiledName, object keyFileValue, Document doc)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return;
            }
            using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                iw.UpdateDocument(new Term(keyFiledName, keyFileValue.ToString()), doc);
                iw.Optimize();
            }
        }

        /// <summary>
        /// Whether the specified document exists in the index
        /// </summary>
        /// <param name="indexDir"></param>
        /// <param name="keyFiledName"></param>
        /// <param name="keyFileValue"></param>
        /// <returns></returns>
        public static bool ExistsDocument(string indexDir, string keyFiledName, object keyFileValue)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return false;
            }
            var reader = IndexReader.Open(directory, true);
            return reader.DocFreq(new Term(keyFiledName, keyFileValue.ToString())) > 0;
        }

        /// <summary>
        /// Search the index for matching records
        /// </summary>
        /// <typeparam name="TResult"></typeparam>
        /// <param name="indexDir"></param>
        /// <param name="buildQueryAction"></param>
        /// <param name="getSortFieldsFunc"></param>
        /// <param name="buildResultFunc"></param>
        /// <param name="topCount"></param>
        /// <param name="needHighlight"></param>
        /// <returns></returns>
        public static List<TResult> SearchIndex<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
            Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, bool needHighlight = true, int topCount = 0)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return new List<TResult>();
            }
            IndexReader reader = IndexReader.Open(directory, true);
            IndexSearcher searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery();
            var keywords = buildQueryAction(bQuery);
            Sort sort = null;
            var sortFields = getSortFieldsFunc();
            if (sortFields != null)
            {
                sort = new Sort();
                sort.SetSort(sortFields.ToArray());
            }
            topCount = topCount > 0 ? topCount : int.MaxValue; // when no top count is given, use int.MaxValue to fetch everything
            TopDocs resultDocs = null;
            if (sort != null)
            {
                resultDocs = searcher.Search(bQuery, null, topCount, sort);
            }
            else
            {
                resultDocs = searcher.Search(bQuery, null, topCount);
            }
            if (topCount > resultDocs.TotalHits)
            {
                topCount = resultDocs.TotalHits;
            }
            Dictionary<string, PropertyInfo> highlightProps = null;
            List<TResult> results = new List<TResult>();
            if (resultDocs != null)
            {
                for (int i = 0; i < topCount; i++)
                {
                    Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
                    var model = buildResultFunc(doc);
                    if (needHighlight)
                    {
                        model = SetHighlighter(keywords, model, ref highlightProps);
                    }
                    results.Add(model);
                }
            }
            return results;
        }

        /// <summary>
        /// Search the index for matching records, page by page
        /// </summary>
        /// <typeparam name="TResult"></typeparam>
        /// <param name="indexDir"></param>
        /// <param name="buildQueryAction"></param>
        /// <param name="getSortFieldsFunc"></param>
        /// <param name="buildResultFunc"></param>
        /// <param name="pageSize"></param>
        /// <param name="page"></param>
        /// <param name="totalCount"></param>
        /// <param name="needHighlight"></param>
        /// <returns></returns>
        public static List<TResult> SearchIndexByPage<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
            Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, int pageSize, int page, out int totalCount, bool needHighlight = true)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                totalCount = 0;
                return new List<TResult>();
            }
            IndexReader reader = IndexReader.Open(directory, true);
            IndexSearcher searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery();
            var keywords = buildQueryAction(bQuery);
            Sort sort = null;
            var sortFields = getSortFieldsFunc();
            if (sortFields != null)
            {
                sort = new Sort();
                sort.SetSort(sortFields.ToArray());
            }
            TopScoreDocCollector docCollector = TopScoreDocCollector.Create(1, true);
            searcher.Search(bQuery, docCollector);
            totalCount = docCollector.TotalHits;
            if (totalCount <= 0) return null;
            TopDocs resultDocs = searcher.Search(bQuery, null, pageSize * page, sort);

            Dictionary<string, PropertyInfo> highlightProps = null;
            List<TResult> results = new List<TResult>();
            int indexStart = (page - 1) * pageSize;
            int indexEnd = indexStart + pageSize;
            if (totalCount < indexEnd) indexEnd = totalCount;

            if (resultDocs != null)
            {
                for (int i = indexStart; i < indexEnd; i++)
                {
                    Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
                    var model = buildResultFunc(doc);
                    if (needHighlight)
                    {
                        model = SetHighlighter(keywords, model, ref highlightProps);
                    }
                    results.Add(model);
                }
            }
            return results;
        }

        /// <summary>
        /// Apply keyword highlighting to the result model
        /// </summary>
        /// <typeparam name="T"></typeparam>
        /// <param name="dicKeywords"></param>
        /// <param name="model"></param>
        /// <param name="props"></param>
        /// <returns></returns>
        private static T SetHighlighter<T>(IDictionary<string, string> dicKeywords, T model, ref Dictionary<string, PropertyInfo> props)
        {
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new Segment());
            highlighter.FragmentSize = 250;
            Type modelType = typeof(T);
            foreach (var item in dicKeywords)
            {
                if (!string.IsNullOrWhiteSpace(item.Value))
                {
                    if (props == null)
                    {
                        props = new Dictionary<string, PropertyInfo>();
                    }
                    if (!props.ContainsKey(item.Key))
                    {
                        props[item.Key] = modelType.GetProperty(item.Key, BindingFlags.IgnoreCase | BindingFlags.Public | BindingFlags.Instance);
                    }
                    var modelProp = props[item.Key];
                    if (modelProp.PropertyType == typeof(string))
                    {
                        string newValue = highlighter.GetBestFragment(item.Value, modelProp.GetValue(model).ToString());
                        if (!string.IsNullOrEmpty(newValue))
                        {
                            modelProp.SetValue(model, newValue);
                        }
                    }
                }
            }
            return model;
        }

        /// <summary>
        /// Split a keyword phrase into space-separated, boosted terms
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public static string GetKeywordsSplitBySpace(string keyword)
        {
            PanGuTokenizer ktTokenizer = new PanGuTokenizer();
            StringBuilder result = new StringBuilder();
            ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keyword);
            foreach (WordInfo word in words)
            {
                if (word == null)
                {
                    continue;
                }
                result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
            }
            return result.ToString().Trim();
        }

        /// <summary>
        /// [Helper] Create a PanGu single-field query
        /// </summary>
        /// <param name="field"></param>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public static Query CreatePanGuQuery(string field, string keyword, bool needSplit = true)
        {
            if (needSplit)
            {
                keyword = GetKeywordsSplitBySpace(keyword);
            }
            QueryParser parse = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, field, new PanGuAnalyzer());
            parse.DefaultOperator = QueryParser.Operator.OR;
            Query query = parse.Parse(keyword);
            return query;
        }

        /// <summary>
        /// [Helper] Create a PanGu multi-field query
        /// </summary>
        /// <param name="keyword"></param>
        /// <param name="fields"></param>
        /// <returns></returns>
        public static Query CreatePanGuMultiFieldQuery(string keyword, bool needSplit, params string[] fields)
        {
            if (needSplit)
            {
                keyword = GetKeywordsSplitBySpace(keyword);
            }
            QueryParser parse = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, new PanGuAnalyzer());
            parse.DefaultOperator = QueryParser.Operator.OR;
            Query query = parse.Parse(keyword);
            return query;
        }
    }
}
Besides the Lucene.Net NuGet package, the code also references the PanGu word segmenter and its related components, because in most cases our content contains Chinese text. I won't walk through the code line by line; the comments explain it fairly well. Below are some real usage examples.
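The examples that follow call a small GetSearchIndexDir() helper to resolve the physical index folder. Its implementation is not part of SearchEngineUtil and is not shown in the original code; a minimal sketch could look like the following (the "search_index" folder name and the use of AppDomain.CurrentDomain.BaseDirectory are only my assumptions here):

private static string GetSearchIndexDir()
{
    // assumed convention: keep the Lucene index in a "search_index" folder under the application root
    return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "search_index");
}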
Creating an index:
SearchEngineUtil.AddIndex(GetSearchIndexDir(), post, (doc, data) => BuildPostSearchDocument(data, doc));

private Document BuildPostSearchDocument(Post post, Document doc = null)
{
    if (doc == null)
    {
        doc = new Document(); // create the Document
    }
    doc.Add(new Field("id", post.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.Add(new Field("title", post.Title, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("summary", post.Summary, Field.Store.YES, Field.Index.ANALYZED));
    doc.Add(new Field("createtime", post.CreateTime.ToString("yyyy/MM/dd HH:mm"), Field.Store.YES, Field.Index.NO));
    doc.Add(new Field("author", post.IsOriginal ? (post.Creator ?? UserQueryService.FindByName(post.CreateBy)).NickName : post.SourceBy, Field.Store.YES, Field.Index.NO));
    return doc;
}
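When re-saving a post whose index entry may already exist, the utility methods can be combined into an add-or-update step. This is only a sketch of how I would wire ExistsDocument, UpdateIndex and AddIndex together, not code taken from the original project:

var indexDir = GetSearchIndexDir();
if (SearchEngineUtil.ExistsDocument(indexDir, "id", post.Id))
{
    // the post is already indexed, so replace its document
    SearchEngineUtil.UpdateIndex(indexDir, "id", post.Id, BuildPostSearchDocument(post));
}
else
{
    SearchEngineUtil.AddIndex(indexDir, post, (doc, data) => BuildPostSearchDocument(data, doc));
}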
Deleting an index record:
SearchEngineUtil.DeleteIndex(GetSearchIndexDir(), "id", post.Id);
Updating an index record:
SearchEngineUtil.UpdateIndex(GetSearchIndexDir(), "id", post.Id, BuildPostSearchDocument(post));
Paged search:
var keyword = SearchEngineUtil.GetKeywordsSplitBySpace("梦在旅途 中国梦");
var searchResult = SearchEngineUtil.SearchIndexByPage(indexDir, (bQuery) =>
{
    var query = SearchEngineUtil.CreatePanGuMultiFieldQuery(keyword, false, "title", "summary");
    bQuery.Add(query, Occur.SHOULD);
    return new Dictionary<string, string> { { "title", keyword }, { "summary", keyword } };
}, () =>
{
    return new[] { new SortField("id", SortField.INT, true) };
}, doc =>
{
    return new PostSearchInfoDto
    {
        Id = doc.Get("id"),
        Title = doc.Get("title"),
        Summary = doc.Get("summary"),
        Author = doc.Get("author"),
        CreateTime = doc.Get("createtime")
    };
}, pageSize, pageNo, out totalCount);
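Because buildQueryAction hands you the BooleanQuery itself, you can attach extra clauses before returning the keyword dictionary used for highlighting. The sketch below (reusing keyword, pageSize, pageNo and totalCount from the example above) adds a required TermQuery filter alongside the PanGu keyword query; note that the "author" filter is purely illustrative, since in the document built earlier that field is stored but not indexed, so a real filter field would need to be indexed as NOT_ANALYZED:

var searchResult = SearchEngineUtil.SearchIndexByPage(indexDir, (bQuery) =>
{
    // keyword match over title/summary, plus a required filter clause (illustrative only)
    bQuery.Add(SearchEngineUtil.CreatePanGuMultiFieldQuery(keyword, false, "title", "summary"), Occur.SHOULD);
    bQuery.Add(new TermQuery(new Term("author", "zuowenjun")), Occur.MUST);
    return new Dictionary<string, string> { { "title", keyword }, { "summary", keyword } };
}, () => new[] { new SortField("id", SortField.INT, true) },
doc => new PostSearchInfoDto { Title = doc.Get("title"), Summary = doc.Get("summary") },
pageSize, pageNo, out totalCount);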
Other methods, such as checking whether a given document exists in the index and the non-paged search, are not shown above; a quick sketch follows, and you are welcome to copy the class into your own project and try them out.
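For completeness, here is roughly how those two calls could look; the query and the result mapping are placeholders of mine rather than code from the original project:

// does a document with this id already exist in the index?
bool exists = SearchEngineUtil.ExistsDocument(GetSearchIndexDir(), "id", post.Id);

// non-paged search: take the top 20 matches on the title field
var keyword = SearchEngineUtil.GetKeywordsSplitBySpace("梦在旅途");
var topPosts = SearchEngineUtil.SearchIndex(GetSearchIndexDir(), (bQuery) =>
{
    bQuery.Add(SearchEngineUtil.CreatePanGuQuery("title", keyword, false), Occur.SHOULD);
    return new Dictionary<string, string> { { "title", keyword } };
}, () => null,
doc => new PostSearchInfoDto { Id = doc.Get("id"), Title = doc.Get("title") },
topCount: 20);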
Here you can see what the search scenario looks like in my own project (a completely revamped version of my personal blog, still under development).
One last note: Lucene is not a complete, out-of-the-box full-text search product, but understanding it is still helpful when learning Elasticsearch or Solr; in real production projects today, people mostly use the higher-level Elasticsearch or Solr.
(I actually wrote the code in this post quite early this year; I am only getting around to sharing it today.)
I like wrapping commonly used components; for example, I have previously written:
MongoDBCsharpHelper, a CRUD wrapper class based on the official MongoDB C# driver
A reusable RabbitMQ ConnectionPool built on the RabbitMQ.Client library