C#编写了一个基于Lucene.Net的搜索引擎查询通用工具类:SearchEngineUtil

2022-10-14

  最近由于工作原因，一直忙于公司的各种项目（大部分都是基于 Spring Cloud 的微服务项目），故有一段时间没有与大家分享总结最近的技术研究成果了。其实最近我一直在不断深入研究学习 Spring、Spring Boot、Spring Cloud 的各种框架原理，同时也随时关注着 .NET Core 的发展情况及最新技术点，也在极客时间上订阅了相关的专栏，只要下班有空我都会去认真阅读观看，纸质书籍也买了一些。总之近一年都是在通过微信技术公众号（.NET、Java、算法、前端等技术方向）、极客时间、技术书籍不断地吸取、借鉴他人之精华，从而不断充实提高自己的技术水平。所谓：学如逆水行舟，不进则退；工作中学习，学习后在工作中运用。当然，写文章分享是一种总结，同时也是“温故而知新”的最佳应用。

  前面废话说得有点多了,就直奔本文的主题内容,编写一个基于lucene.net的搜索引擎查询通用工具类:searchengineutil,lucene是什么,见百度百科 ,重点是:lucene是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,lucene.net是c#及.net运行时下的另一种语言的实现,官网地址:  ,具体用法就不多说了,官网以及网上都有很多,但由于lucene.net的原生sdk中的api比较复杂,用起来不太方便,故我进行了适当的封装,把常用的增、删、改、查(分页查)在保证灵活度的情况下进行了封装,使得操作lucene.net变得相对简单一些,代码本身也不复杂,贴出完整的searchengineutil代码如下:

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NLog;
using PanGu;
using PanGu.HighLight;

namespace CN.Zuowenjun.Blog.Common
{
    /// <summary>
    /// General-purpose helper for Lucene.Net based full-text search:
    /// create / delete / update index records and run (paged) queries,
    /// with optional PanGu-based keyword highlighting.
    /// Author: zuowenjun
    /// </summary>
    public class SearchEngineUtil
    {

        /// <summary>
        /// Creates the index directory if necessary and adds one document to the index.
        /// </summary>
        /// <typeparam name="TIndex">Type of the source data object.</typeparam>
        /// <param name="indexDir">Physical directory that holds the index files.</param>
        /// <param name="indexData">Data object used to populate the document fields.</param>
        /// <param name="setDocFieldsAction">Callback that maps the data object onto document fields.</param>
        public static void AddIndex<TIndex>(string indexDir, TIndex indexData, Action<Document, TIndex> setDocFieldsAction)
        {
            // Ensure the index directory exists before opening it.
            if (!System.IO.Directory.Exists(indexDir))
            {
                System.IO.Directory.CreateDirectory(indexDir);
            }
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            bool isUpdate = IndexReader.IndexExists(directory);
            if (isUpdate)
            {
                // If the index is locked (e.g. a previous writer crashed mid-indexing), unlock it first.
                if (IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }
            }
            using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED))
            {
                Document document = new Document();

                setDocFieldsAction(document, indexData);

                writer.AddDocument(document);

                writer.Optimize(); // merge segments so subsequent searches are faster
            }
        }

        /// <summary>
        /// Deletes all index records whose key field matches the given value.
        /// No-op when the index does not exist yet.
        /// </summary>
        /// <param name="indexDir">Physical directory that holds the index files.</param>
        /// <param name="keyFieldName">Name of the key field (must be NOT_ANALYZED).</param>
        /// <param name="keyFieldValue">Value identifying the record(s) to delete.</param>
        public static void DeleteIndex(string indexDir, string keyFieldName, object keyFieldValue)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return;
            }

            using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                iw.DeleteDocuments(new Term(keyFieldName, keyFieldValue.ToString()));
                // Deletion only writes a .del marker file; Optimize() physically purges the
                // deleted documents. Before purging, UndeleteAll() could still restore them.
                iw.Optimize();
            }
        }

        /// <summary>
        /// Replaces the index record identified by the key field with the supplied document.
        /// No-op when the index does not exist yet.
        /// </summary>
        /// <param name="indexDir">Physical directory that holds the index files.</param>
        /// <param name="keyFieldName">Name of the key field (must be NOT_ANALYZED).</param>
        /// <param name="keyFieldValue">Value identifying the record to update.</param>
        /// <param name="doc">Replacement document.</param>
        public static void UpdateIndex(string indexDir, string keyFieldName, object keyFieldValue, Document doc)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return;
            }

            using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
            {
                // UpdateDocument = atomic delete-by-term + add.
                iw.UpdateDocument(new Term(keyFieldName, keyFieldValue.ToString()), doc);
                iw.Optimize();
            }
        }

        /// <summary>
        /// Checks whether at least one document with the given key field value exists.
        /// </summary>
        /// <param name="indexDir">Physical directory that holds the index files.</param>
        /// <param name="keyFieldName">Name of the key field (must be NOT_ANALYZED).</param>
        /// <param name="keyFieldValue">Value to look up.</param>
        /// <returns>True when a matching document exists; false otherwise (or when no index exists).</returns>
        public static bool ExistsDocument(string indexDir, string keyFieldName, object keyFieldValue)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
            if (!IndexReader.IndexExists(directory))
            {
                return false;
            }

            // BUGFIX: the reader was previously never disposed (file-handle leak).
            using (var reader = IndexReader.Open(directory, true))
            {
                return reader.DocFreq(new Term(keyFieldName, keyFieldValue.ToString())) > 0;
            }
        }

        /// <summary>
        /// Runs a query against the index and materializes the matching documents.
        /// </summary>
        /// <typeparam name="TResult">Result item type.</typeparam>
        /// <param name="indexDir">Physical directory that holds the index files.</param>
        /// <param name="buildQueryAction">Populates the boolean query; returns the field→keyword map used for highlighting.</param>
        /// <param name="getSortFieldsFunc">Returns sort fields, or null for relevance order. May itself be null.</param>
        /// <param name="buildResultFunc">Maps a matched document to a result item.</param>
        /// <param name="needHighlight">When true, wraps matched keywords in string properties with highlight markup.</param>
        /// <param name="topCount">Maximum number of hits to return; 0 (default) means all hits.</param>
        /// <returns>Matched results; empty list when the index does not exist.</returns>
        public static List<TResult> SearchIndex<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
            Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, bool needHighlight = true, int topCount = 0)
        {
            // Read-only search: NoLockFactory avoids write-lock contention with indexing.
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());

            if (!IndexReader.IndexExists(directory))
            {
                return new List<TResult>();
            }

            // BUGFIX: reader and searcher were previously never disposed.
            using (IndexReader reader = IndexReader.Open(directory, true))
            using (IndexSearcher searcher = new IndexSearcher(reader))
            {
                BooleanQuery bQuery = new BooleanQuery();
                var keywords = buildQueryAction(bQuery);

                Sort sort = null;
                var sortFields = getSortFieldsFunc != null ? getSortFieldsFunc() : null;
                if (sortFields != null)
                {
                    sort = new Sort();
                    sort.SetSort(sortFields.ToArray());
                }

                topCount = topCount > 0 ? topCount : int.MaxValue; // 0 means "return everything"
                TopDocs resultDocs = sort != null
                    ? searcher.Search(bQuery, null, topCount, sort)
                    : searcher.Search(bQuery, null, topCount);

                // Clamp to the actual hit count so the loop below never over-indexes ScoreDocs.
                if (topCount > resultDocs.TotalHits)
                {
                    topCount = resultDocs.TotalHits;
                }

                Dictionary<string, PropertyInfo> highlightProps = null;
                List<TResult> results = new List<TResult>();
                for (int i = 0; i < topCount; i++)
                {
                    Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
                    var model = buildResultFunc(doc);
                    if (needHighlight)
                    {
                        model = SetHighlighter(keywords, model, ref highlightProps);
                    }

                    results.Add(model);
                }

                return results;
            }
        }

        /// <summary>
        /// Runs a paged query against the index.
        /// </summary>
        /// <typeparam name="TResult">Result item type.</typeparam>
        /// <param name="indexDir">Physical directory that holds the index files.</param>
        /// <param name="buildQueryAction">Populates the boolean query; returns the field→keyword map used for highlighting.</param>
        /// <param name="getSortFieldsFunc">Returns sort fields, or null for relevance order. May itself be null.</param>
        /// <param name="buildResultFunc">Maps a matched document to a result item.</param>
        /// <param name="pageSize">Items per page.</param>
        /// <param name="page">1-based page number.</param>
        /// <param name="totalCount">Receives the total number of hits.</param>
        /// <param name="needHighlight">When true, wraps matched keywords in string properties with highlight markup.</param>
        /// <returns>The requested page of results; empty list when the index does not exist or there are no hits.</returns>
        public static List<TResult> SearchIndexByPage<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
            Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, int pageSize, int page, out int totalCount, bool needHighlight = true)
        {
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());

            if (!IndexReader.IndexExists(directory))
            {
                totalCount = 0;
                return new List<TResult>();
            }

            // BUGFIX: reader and searcher were previously never disposed.
            using (IndexReader reader = IndexReader.Open(directory, true))
            using (IndexSearcher searcher = new IndexSearcher(reader))
            {
                BooleanQuery bQuery = new BooleanQuery();
                var keywords = buildQueryAction(bQuery);

                Sort sort = null;
                var sortFields = getSortFieldsFunc != null ? getSortFieldsFunc() : null;
                if (sortFields != null)
                {
                    sort = new Sort();
                    sort.SetSort(sortFields.ToArray());
                }

                // A 1-sized collector is enough to obtain the total hit count cheaply.
                TopScoreDocCollector docCollector = TopScoreDocCollector.Create(1, true);
                searcher.Search(bQuery, docCollector);
                totalCount = docCollector.TotalHits;

                // BUGFIX: previously returned null here, inconsistent with the empty-index
                // path above; callers should always get a (possibly empty) list.
                if (totalCount <= 0) return new List<TResult>();

                // Fetch up to the end of the requested page, then slice out that page below.
                // BUGFIX: previously a null Sort was passed unconditionally to the sorting
                // overload, which fails when no sort fields were supplied.
                TopDocs resultDocs = sort != null
                    ? searcher.Search(bQuery, null, pageSize * page, sort)
                    : searcher.Search(bQuery, null, pageSize * page);

                Dictionary<string, PropertyInfo> highlightProps = null;
                List<TResult> results = new List<TResult>();
                int indexStart = (page - 1) * pageSize;
                int indexEnd = indexStart + pageSize;
                if (totalCount < indexEnd) indexEnd = totalCount; // last page may be partial

                for (int i = indexStart; i < indexEnd; i++)
                {
                    Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
                    var model = buildResultFunc(doc);
                    if (needHighlight)
                    {
                        model = SetHighlighter(keywords, model, ref highlightProps);
                    }

                    results.Add(model);
                }

                return results;
            }
        }



        /// <summary>
        /// Highlights the matched keywords inside the model's string properties by
        /// wrapping them in red &lt;font&gt; tags (PanGu highlighter, 250-char fragments).
        /// </summary>
        /// <typeparam name="T">Model type whose properties get highlighted.</typeparam>
        /// <param name="dicKeywords">Map of property name → keyword expression to highlight.</param>
        /// <param name="model">Model instance to mutate.</param>
        /// <param name="props">Cache of reflected properties, shared across calls in one search.</param>
        /// <returns>The same model instance, with highlighted property values.</returns>
        private static T SetHighlighter<T>(IDictionary<string, string> dicKeywords, T model, ref Dictionary<string, PropertyInfo> props)
        {
            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new Segment());
            highlighter.FragmentSize = 250;

            Type modelType = typeof(T);
            foreach (var item in dicKeywords)
            {
                if (string.IsNullOrWhiteSpace(item.Value))
                {
                    continue;
                }

                if (props == null)
                {
                    props = new Dictionary<string, PropertyInfo>();
                }

                if (!props.ContainsKey(item.Key))
                {
                    props[item.Key] = modelType.GetProperty(item.Key, BindingFlags.IgnoreCase | BindingFlags.Public | BindingFlags.Instance);
                }

                var modelProp = props[item.Key];
                // BUGFIX: guard against a missing property (GetProperty returned null)
                // and against a null property value — both previously threw NRE.
                if (modelProp != null && modelProp.PropertyType == typeof(string))
                {
                    var rawValue = modelProp.GetValue(model, null);
                    if (rawValue == null)
                    {
                        continue;
                    }
                    string newValue = highlighter.GetBestFragment(item.Value, rawValue.ToString());
                    if (!string.IsNullOrEmpty(newValue))
                    {
                        modelProp.SetValue(model, newValue, null);
                    }
                }
            }

            return model;
        }


        /// <summary>
        /// Segments a phrase with the PanGu tokenizer and joins the tokens with spaces,
        /// boosting each token by 3^rank (e.g. "word^9.0") for query-parser input.
        /// </summary>
        /// <param name="keyword">Raw phrase to segment.</param>
        /// <returns>Space-separated, boost-annotated token string.</returns>
        public static string GetKeywordsSplitBySpace(string keyword)
        {
            PanGuTokenizer ktTokenizer = new PanGuTokenizer();
            StringBuilder result = new StringBuilder();
            ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keyword);
            foreach (WordInfo word in words)
            {
                if (word == null)
                {
                    continue;
                }
                // 3^rank makes higher-ranked tokens exponentially more significant.
                result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
            }
            return result.ToString().Trim();
        }

        /// <summary>
        /// [Helper] Builds a single-field PanGu query (OR semantics between tokens).
        /// </summary>
        /// <param name="field">Field name to query.</param>
        /// <param name="keyword">Keyword phrase; segmented first unless <paramref name="needSplit"/> is false.</param>
        /// <param name="needSplit">Whether to run PanGu segmentation on the keyword.</param>
        /// <returns>The parsed query.</returns>
        public static Query CreatePanGuQuery(string field, string keyword, bool needSplit = true)
        {
            if (needSplit)
            {
                keyword = GetKeywordsSplitBySpace(keyword);
            }

            QueryParser parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, field, new PanGuAnalyzer());
            parser.DefaultOperator = QueryParser.Operator.OR;
            Query query = parser.Parse(keyword);
            return query;
        }

        /// <summary>
        /// [Helper] Builds a multi-field PanGu query (OR semantics between tokens).
        /// </summary>
        /// <param name="keyword">Keyword phrase; segmented first unless <paramref name="needSplit"/> is false.</param>
        /// <param name="needSplit">Whether to run PanGu segmentation on the keyword.</param>
        /// <param name="fields">Field names to query across.</param>
        /// <returns>The parsed query.</returns>
        public static Query CreatePanGuMultiFieldQuery(string keyword, bool needSplit, params string[] fields)
        {
            if (needSplit)
            {
                keyword = GetKeywordsSplitBySpace(keyword);
            }

            QueryParser parser = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, new PanGuAnalyzer());
            parser.DefaultOperator = QueryParser.Operator.OR;
            Query query = parser.Parse(keyword);
            return query;
        }

    }
}

 里面除了使用了lucene.net nuget包,还单独引用了pangu分词器及其相关组件,因为大多数情况下我们的内容会包含中文。如上代码就不再细讲了,注释得比较清楚了。下面贴出一些实际的用法:

创建索引:

 searchengineutil.addindex(getsearchindexdir(), post, (doc, data) => buildpostsearchdocument(data, doc));


        /// <summary>
        /// Maps a Post entity onto a Lucene document: "id" is the stored, non-analyzed key;
        /// "title"/"summary" are analyzed for full-text search; "createtime"/"author" are
        /// stored for display only (not indexed).
        /// </summary>
        /// <param name="post">Post entity to index.</param>
        /// <param name="doc">Existing document to fill (update scenario), or null to create one.</param>
        /// <returns>The populated document.</returns>
        private Document BuildPostSearchDocument(Post post, Document doc = null)
        {

            if (doc == null)
            {
                doc = new Document(); // add scenario: create a fresh document
            }

            doc.Add(new Field("id", post.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("title", post.Title, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("summary", post.Summary, Field.Store.YES, Field.Index.ANALYZED));
            // BUGFIX: format must be "yyyy/MM/dd HH:mm" — lowercase "mm" is minutes and
            // "hh" is the 12-hour clock, which would render wrong month/hour values.
            doc.Add(new Field("createtime", post.CreateTime.ToString("yyyy/MM/dd HH:mm"), Field.Store.YES, Field.Index.NO));
            doc.Add(new Field("author", post.IsOriginal ? (post.Creator ?? UserQueryService.FindByName(post.CreateBy)).NickName : post.SourceBy, Field.Store.YES, Field.Index.NO));

            return doc;
        }

 删除索引:

 searchengineutil.deleteindex(getsearchindexdir(), "id", post.id);

 更新索引:

searchengineutil.updateindex(getsearchindexdir(), "id", post.id, buildpostsearchdocument(post));

 分页查询:

               var keyword = searchengineutil.getkeywordssplitbyspace("梦在旅途 中国梦");
                var searchresult = searchengineutil.searchindexbypage(indexdir, (bquery) =>
                {
                    var query = searchengineutil.createpangumultifieldquery(keyword, false, "title", "summary");
                    bquery.add(query, occur.should);
                    return new dictionary<string, string> {
                    { "title",keyword},{"summary",keyword}
                    };
                }, () =>
                {
                    return new[] { new sortfield("id", sortfield.int, true) };
                }, doc =>
                {
                    return new postsearchinfodto
                    {
                        id = doc.get("id"),
                        title = doc.get("title"),
                        summary = doc.get("summary"),
                        author = doc.get("author"),
                        createtime = doc.get("createtime")
                    };

                }, pagesize, pageno, out totalcount);

其它的还有:判断索引中的指定文档记录存不存在、查询符合条件的索引文档等在此没有列出,大家有兴趣的可以copy到自己的项目中测试一下。

这里可以看一下我在自己的项目中(个人全新改版的自己博客,还在开发中)应用搜索场景的效果:

 

最后说明的是:lucene并不是一个完整的全文检索引擎,但了解它对于学习elasticsearch、solr还是有一定的帮助,目前一般应用于实际的生产项目中,多半是使用更高层的elasticsearch、solr。

 (本文中的代码我是今年很早前就写好了,只是今天才分享出来)

 

我喜欢对一些常用的组件进行封装,比如过往封装有:

基于mongodb官方c#驱动封装mongodbcsharphelper类(crud类)

基于rabbitmq.client组件实现rabbitmq可复用的 connectionpool(连接池)

 

《C#编写了一个基于Lucene.Net的搜索引擎查询通用工具类:SearchEngineUtil.doc》

下载本文的Word格式文档,以方便收藏与打印。