spring boot+全文搜索框架lucene
1.全文搜索框架lucene和mysql like%对比:全文索引是先对数据分词并建立索引,搜索时再对搜索词进行拆分,然后根据索引查找内容,系统需要维护索引;mysql like%用法是全表遍历一遍,效率相对比较慢。2.github(项目)https://github.com/dajitui/spring-boot-lucene-ik,后续再详细研究其中的方法。
·
1.全文搜索框架lucene和mysql like%对比
全文索引是先对数据分词并建立索引,搜索时再对搜索词进行拆分,然后根据索引查找内容。系统需要维护索引。
mysql like%用法是全表遍历一遍,效率相对比较慢。
2.github(项目)
https://github.com/dajitui/spring-boot-lucene-ik
3.详细的过程
我的初衷也是想和数据库结合的,所以:
利用jpa查询得到数据,由于一般数据量都是众多的,所以不能用数据库的like进行查询!
得到数据后,需要通过关联lucene版本和分值器,再创建一个文本保存索引,然后写入。
Directory directory = null;
IndexWriterConfig config = null;
IndexWriter iwriter = null;
try {
    // Directory on disk that stores the index files.
    directory = FSDirectory.open(new File(dir));
    // Bind the Lucene version and the analyzer (tokenizer).
    config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    // Writer over the directory using the configured analyzer.
    iwriter = new IndexWriter(directory, config);
    // Write the document into the index
    // (the original snippet also committed before adding — redundant).
    iwriter.addDocument(doc);
    // Make the change durable.
    iwriter.commit();
} catch (IOException e) {
    e.printStackTrace();
} finally {
    // Always release the writer and the directory, even when
    // opening/indexing fails (the original leaked both on exception
    // and never closed the directory).
    try {
        if (iwriter != null) {
            iwriter.close();
        }
        if (directory != null) {
            directory.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
把数据写进document,再写入刚刚创建的文件里面
// Fetch the i-th row produced by the JPA query.
Map<String, Object> row = queryFood.get(i);
// One Lucene Document per database row.
Document doc = new Document();
// Copy each column into a stored, tokenized field
// (TYPE_STORED keeps the raw value retrievable at search time).
doc.add(new Field("foodid", row.get("foodid").toString(), TextField.TYPE_STORED));
doc.add(new Field("foodname", row.get("foodname").toString(), TextField.TYPE_STORED));
doc.add(new Field("price", row.get("price").toString(), TextField.TYPE_STORED));
doc.add(new Field("imagepath", row.get("imagepath").toString(), TextField.TYPE_STORED));
// Hand the document to the index writer, which builds the index.
indexDemo.write(doc);
当搜索的时候,读取索引文件,通过索引进行查询:
// Directory holding the index files.
Directory directory = FSDirectory.open(new File(dir));
// Reader over the stored index.
DirectoryReader ireader = DirectoryReader.open(directory);
// Searcher built on top of the reader.
IndexSearcher isearcher = new IndexSearcher(ireader);
// Query parser: binds the target field name and the analyzer.
QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);
// Parse the user's search text into a Lucene query.
Query query = parser.parse(value);
// Wraps each matched term with this prefix/suffix
// (the formatter's default, if none given, would be <B></B>).
SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<font color=red>", "</font>");
// Highlighter that scores text fragments against the query terms.
Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
// Run the search; keep at most the top 5 documents.
ScoreDoc[] hits = isearcher.search(query, null, 5).scoreDocs;
List<Map> list = new ArrayList<Map>();
// Collect each hit into a map of column name -> value.
for (int i = 0; i < hits.length; i++) {
    int id = hits[i].doc;
    Document hitDoc = isearcher.doc(hits[i].doc);
    Map map = new HashMap();
    map.put("foodid", hitDoc.get("foodid"));
    // Stored value of the foodname column.
    String foodname = hitDoc.get("foodname");
    // Re-tokenize the stored field so the highlighter can locate matches.
    TokenStream tokenStream = TokenSources.getAnyTokenStream(isearcher.getIndexReader(), id, "foodname", analyzer);
    // Second argument is the raw text the fragments are cut from.
    TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, foodname, false, 10);
    // Fall back to the raw stored value when the query did not hit this
    // field; the original snippet left an empty string in that case.
    String foodValue = foodname;
    for (int j = 0; j < frag.length; j++) {
        if ((frag[j] != null) && (frag[j].getScore() > 0)) {
            // Use the highlighted fragment (match wrapped in the font tags).
            foodValue = frag[j].toString();
        }
    }
    map.put("foodname", foodValue);
    map.put("price", hitDoc.get("price"));
    map.put("imagepath", hitDoc.get("imagepath"));
    list.add(map);
}
ireader.close();
directory.close();
return list;
Lucene索引维护
上面是创建索引
增量添加索引
/**
 * Adds one new document to the existing index (incremental indexing).
 *
 * @throws Exception if the index directory cannot be opened or written
 */
public static void insert() throws Exception {
    String text5 = "hello,goodbye,man,woman";
    long start = System.currentTimeMillis();
    analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    directory = FSDirectory.open(new File(INDEX_DIR));
    IndexWriterConfig config = new IndexWriterConfig(
            Version.LUCENE_CURRENT, analyzer);
    indexWriter = new IndexWriter(directory, config);
    try {
        Document doc1 = new Document();
        doc1.add(new TextField("filename", "text5", Store.YES));
        doc1.add(new TextField("content", text5, Store.YES));
        indexWriter.addDocument(doc1);
        indexWriter.commit();
    } finally {
        // Release the writer and the directory even if indexing fails
        // (the original leaked both on exception and never closed directory).
        indexWriter.close();
        directory.close();
    }
    System.out.println("增加索引耗时:" + (System.currentTimeMillis() - start) + "ms\n");
}
删除索引
/**
 * Deletes every indexed document whose "filename" field matches the keyword.
 *
 * @param str exact term matched against the "filename" field
 * @throws Exception if the index directory cannot be opened or written
 */
public static void delete(String str) throws Exception {
    long start = System.currentTimeMillis();
    analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    directory = FSDirectory.open(new File(INDEX_DIR));
    IndexWriterConfig config = new IndexWriterConfig(
            Version.LUCENE_CURRENT, analyzer);
    indexWriter = new IndexWriter(directory, config);
    try {
        indexWriter.deleteDocuments(new Term("filename", str));
        // Make the deletion durable explicitly, consistent with insert().
        indexWriter.commit();
    } finally {
        // Release the writer and the directory even on failure
        // (the original leaked both on exception and never closed directory).
        indexWriter.close();
        directory.close();
    }
    System.out.println("删除索引耗时:" + (System.currentTimeMillis() - start) + "ms\n");
}
更新索引
/**
 * Replaces the document whose "filename" term equals "text1" with new content
 * (Lucene updates are implemented as delete-then-add).
 *
 * @throws Exception if the index directory cannot be opened or written
 */
public static void update() throws Exception {
    String text1 = "update,hello,man!";
    long start = System.currentTimeMillis();
    analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    directory = FSDirectory.open(new File(INDEX_DIR));
    IndexWriterConfig config = new IndexWriterConfig(
            Version.LUCENE_CURRENT, analyzer);
    indexWriter = new IndexWriter(directory, config);
    try {
        Document doc1 = new Document();
        doc1.add(new TextField("filename", "text1", Store.YES));
        doc1.add(new TextField("content", text1, Store.YES));
        indexWriter.updateDocument(new Term("filename", "text1"), doc1);
        // Make the replacement durable explicitly, consistent with insert().
        indexWriter.commit();
    } finally {
        // Release the writer and the directory even on failure
        // (the original leaked both on exception and never closed directory).
        indexWriter.close();
        directory.close();
    }
    System.out.println("更新索引耗时:" + (System.currentTimeMillis() - start) + "ms\n");
}
根据索引查询
/**
 * Searches the "content" field for the given keyword and prints the
 * "filename" and "content" of every hit to stdout.
 *
 * @param str query text, parsed against the "content" field
 * @throws Exception if the index cannot be opened or the query cannot be parsed
 */
public static void search(String str) throws Exception {
    directory = FSDirectory.open(new File(INDEX_DIR));
    analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    DirectoryReader ireader = DirectoryReader.open(directory);
    try {
        IndexSearcher isearcher = new IndexSearcher(ireader);
        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content", analyzer);
        Query query = parser.parse(str);
        // Keep at most the top 1000 matching documents.
        ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            Document hitDoc = isearcher.doc(hits[i].doc);
            System.out.println(hitDoc.get("filename"));
            System.out.println(hitDoc.get("content"));
        }
    } finally {
        // Close the reader and the directory even when parse/search throws
        // (the original leaked both on exception).
        ireader.close();
        directory.close();
    }
}
更多推荐
已为社区贡献2条内容
所有评论(0)