网站首页 > 开源技术正文

搜索引擎Lucene的秘密，在10W篇文章中的表现如何?

wxchong 2024-10-23 15:50:33 开源技术 19 ℃ 0 评论

简介

之前说过搜索引擎Lucene的原理（详情可见《5分钟了解搜索引擎Lucene的原理》），知道Lucene是通过反向索引和数据分段来存储索引的，这次来测试一下它的性能到底怎么样！

测试思路

生成大量的文本文件，每个文本文件5000个随机中文字符，搜索某一个关键字，用Lucene创建索引搜索和顺序扫描搜索做耗时对比，控制文本文件的数量记录下两者的耗时。

生成的文本如下，可以把一个文本文件当作一篇“文章”：

挎钚箽氝炲汞滑牉塹鮆偆蛖複栟潏檵...
鑷鹹煴疇稚挷怎桗伖桴踎卼烺錢耻咹...
鄂羣弇伎艥塅緳剆畊鬑情吾蔱鶻汅擳...
犵冡蠷敳訙鼴泙倎劉呖垱躓嚓祠摭鑭...
训莈穰佢宔髒货爠礬穳崋暝並吃鰞乩...

实验不足：生成的文本文件是随机中文字符，不能代表实际中的使用场景，比较贴近实际的是去网络中爬取文章，但是耗时耗力，暂时没时间做。

实验结果

可以看到，文本数量在1000以内的时候，Lucene和顺序扫描不相上下，甚至在文本数量很少的时候，顺序扫描还快于Lucene；但是随着文本数量的上升，顺序扫描的耗时线性增长，而Lucene的耗时波动很小。当文本数量到达50000时，顺序扫描需要8.4秒左右，已经到了无法忍受的地步了！

而当文本数量到达10W时，Lucene需要116毫秒，而顺序扫描则需要37.9秒！

当文本数量为50000时，生成的原始文本文件大小为781M，Lucene创建的索引大小为1.28G，可作为占用空间的参考；其他数量下的占用空间忘记记录了。

测试程序

测试程序如下，感兴趣的自己跑跑


import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Before;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.*;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.Random;

/**
 * 效率对比
 * 从大量文本中顺序扫描  VS 全文检索
 *
 *  maxFileAmount = 50
 * 生成完毕，耗时：42
 * 索引完毕，耗时：790
 * 总记录数：4 hits
 * 搜索(全文检索)完毕，耗时：81
 * 总记录数：4
 * 搜索(扫描)完毕，耗时：9
 *
 *
 *  maxFileAmount = 100
 * 生成完毕，耗时：70
 * 索引完毕，耗时：1328
 * 总记录数：27 hits
 * 搜索(全文检索)完毕，耗时：69
 * 总记录数：27
 * 搜索(扫描)完毕，耗时：11
 *
 *
 *  maxFileAmount = 1000
 * 生成完毕，耗时：649
 * 索引完毕，耗时：4181
 * 总记录数：207 hits
 * 搜索(全文检索)完毕，耗时：75
 * 总记录数：207
 * 搜索(扫描)完毕，耗时：103
 *
 * maxFileAmount = 5000
 * 生成完毕，耗时：2770
 * 索引完毕，耗时：14434
 * 总记录数：1071 hits
 * 搜索(全文检索)完毕，耗时：85
 * 总记录数：1071
 * 搜索(扫描)完毕，耗时：453
 *
 * maxFileAmount = 10000
 * 生成完毕，耗时：6258
 * 索引完毕，耗时：36224
 * 总记录数：2062 hits
 * 搜索(全文检索)完毕，耗时：52
 * 总记录数：2062
 * 搜索(扫描)完毕，耗时：888
 *
 * maxFileAmount = 20000
 * 生成完毕，耗时：11367
 * 索引完毕，耗时：66083
 * 总记录数：4207 hits
 * 搜索(全文检索)完毕，耗时：62
 * 总记录数：4207
 * 搜索(扫描)完毕，耗时：1717
 *
 *
 * maxFileAmount = 50000
 * 生成完毕，耗时：34962 781M
 * 索引完毕，耗时：164616 1.28G
 * 总记录数：10650 hits
 * 搜索(全文检索)完毕，耗时：87
 * 总记录数：10650
 * 搜索(扫描)完毕，耗时：8453
 *
 *
 *  maxFileAmount = 100000
 * 生成完毕，耗时：73263
 * 索引完毕，耗时：316342
 * 总记录数：21106 hits
 * 搜索完毕，耗时：116
 * 总记录数：21106
 * 搜索(扫描)完毕，耗时：37944
 */
public class EfficientContrast {

    String dataPath = "D:\\TEMP";//文本文件存储路径

    String indexPath = "C:\\temp\\index";//索引文件存储路径

    int maxFileAmount = 50000;//文本文件个数

    @Test
    public void mainTest() throws Exception {
        String searchText = "兄";

        long s =  System.currentTimeMillis();
        generateBigTextFile();
        System.out.println("生成完毕，耗时："+ ((System.currentTimeMillis() - s) - /*后面为0，目的是赋值*/ ((s = System.currentTimeMillis())-s)) );

        generateLuceneIndex(indexPath);
        System.out.println("索引完毕，耗时："+ ((System.currentTimeMillis() - s) - /*后面为0，目的是赋值*/ ((s = System.currentTimeMillis())-s)) );

        searchByLucene(indexPath,searchText);
        System.out.println("搜索(全文检索)完毕，耗时："+ ((System.currentTimeMillis() - s) - /*后面为0，目的是赋值*/ ((s = System.currentTimeMillis())-s)) );

        searchByScanOneByOne(searchText);
        System.out.println("搜索(扫描)完毕，耗时："+ ((System.currentTimeMillis() - s) - /*后面为0，目的是赋值*/ ((s = System.currentTimeMillis())-s)) );
    }



    /**
     * 生成大量文本，类似于
     挎钚箽氝炲汞滑牉塹鮆偆蛖複栟潏檵...
     鑷鹹煴疇稚挷怎桗伖桴踎卼烺錢耻咹...
     鄂羣弇伎艥塅緳剆畊鬑情吾蔱鶻汅擳...
     犵冡蠷敳訙鼴泙倎劉呖垱躓嚓祠摭鑭...
     训莈穰佢宔髒货爠礬穳崋暝並吃鰞乩...
     ...
     * @throws Exception
     */
    void generateBigTextFile() throws Exception {
        //File f = new File(dataPath);
        FileUtils.cleanDirectory(new File(dataPath));
        for (int i = 0; i < maxFileAmount; i++) {
            File f = new File(dataPath+"/" + i +".txt");
            FileOutputStream fis = new FileOutputStream(f);
            char[] chars = new char[5000];//一个文本文件5000个字
            /**
             *
             字符集       字数        Unicode 编码
             基本汉字   20902字  4E00-9FA5
             */
            for (int j = 0; j < chars.length; j++) {
                if (j % 100 == 0){//100个字换一行
                    chars[j] = '\n';
                } else {
                    int charIdx = new Random().nextInt(0x9f9f - 0x4e00) + 0x4e00;
                    chars[j] = (char)charIdx;
                }
            }
            fis.write(new String(chars).getBytes());
            fis.flush();
        }

    }


    /**
     * 生成索引
     * @param indexPath
     * @throws IOException
     */
    void generateLuceneIndex(String indexPath) throws IOException {
        FileUtils.cleanDirectory(new File(indexPath));
        //1、创建一个Director对象，指定索引库保存的位置。
        //把索引库保存在内存中
        //Directory directory = new RAMDirectory();
        //把索引库保存在磁盘
        Directory directory = FSDirectory.open(new File(indexPath).toPath());
        //2、基于Directory对象创建一个IndexWriter对象
        StandardAnalyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter indexWriter = new IndexWriter(directory, config);
        //3、读取磁盘上的文件，对应每个文件创建一个文档对象。

        //路径中文需要decode一下
        File dir = new File(dataPath);
        File[] files = dir.listFiles();
        for (File f : files) {
            //取文件名
            String fileName = f.getName();
            //文件的路径
            String filePath = f.getPath();
            //文件的内容
            String fileContent = FileUtils.readFileToString(f, "utf-8");
            //文件的大小
            long fileSize = FileUtils.sizeOf(f);
            //创建Field
            //参数1：域的名称，参数2：域的内容，参数3：是否存储
            Field fieldName = new TextField("name", fileName, Field.Store.YES);
            //Field fieldPath = new TextField("path", filePath, Field.Store.YES);
            Field fieldPath = new StoredField("path", filePath);
            Field fieldContent = new TextField("content", fileContent, Field.Store.YES);
            //Field fieldSize = new TextField("size", fileSize + "", Field.Store.YES);
            Field fieldSizeValue = new LongPoint("size", fileSize);
            Field fieldSizeStore = new StoredField("size", fileSize);
            //创建文档对象
            Document document = new Document();
            //向文档对象中添加域
            document.add(fieldName);
            document.add(fieldPath);
            document.add(fieldContent);
            //document.add(fieldSize);
            document.add(fieldSizeValue);
            document.add(fieldSizeStore);
            //5、把文档对象写入索引库
            indexWriter.addDocument(document);
        }
        //6、关闭indexwriter对象
        indexWriter.close();
    }


    /**
     * 全文检索搜索
     * @param indexPath
     * @param searchText
     * @throws IOException
     * @throws ParseException
     */
    void searchByLucene(String indexPath,String searchText) throws IOException, ParseException {
        /**
         * 再用QueryParser查询
         */
        Directory directory = FSDirectory.open(new File(indexPath).toPath());
        //2、基于Directory对象创建一个IndexWriter对象
        StandardAnalyzer analyzer = new StandardAnalyzer();
        long s1 = System.currentTimeMillis();
        IndexReader indexReader = DirectoryReader.open(FSDirectory.open(new File(indexPath).toPath()));
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        //创建一个QueryPaser对象，两个参数
        //参数1：默认搜索域，参数2：分析器对象
        QueryParser queryParser = new QueryParser("content", analyzer);
        //使用QueryPaser对象创建一个Query对象
        Query query = queryParser.parse(searchText);
        //执行查询
        TopDocs topDocs = indexSearcher.search(query,indexReader.numDocs());//搜索全部
        System.out.println("总记录数：" + topDocs.totalHits);
    }


    /**
     * 顺序扫描搜索
     * @param searchText
     * @throws IOException
     * @throws ParseException
     */
    void searchByScanOneByOne(String searchText) throws IOException, ParseException {
        int cnt = 0;
        File dir = new File(dataPath);
        File[] files = dir.listFiles();
        for (File f : files) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
            for (String line = reader.readLine();line != null;line=reader.readLine()){
                if (line.contains(searchText)){
                    cnt++;
                    break;
                }
            }
        }
        System.out.println("总记录数：" + cnt);
    }
}

所需要的依赖如下：

<dependencies>
    <!-- Test framework -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
<!--      <scope>test</scope>-->
    </dependency>

    <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.6</version>
    </dependency>



    <!-- Lucene -->
    <!-- Core library -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>8.7.0</version>
    </dependency>

    <!-- General-purpose analyzers, suited to English tokenization -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>8.7.0</version>
    </dependency>

    <!-- Chinese analyzer; usually not used because it is hard to extend -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-smartcn</artifactId>
      <version>8.7.0</version>
    </dependency>

    <!-- Query parsing against the analyzed index -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>8.7.0</version>
    </dependency>

    <!-- IKAnalyzer -->
    <dependency>
      <groupId>com.jianggujin</groupId>
      <artifactId>IKAnalyzer-lucene</artifactId>
      <version>8.0.0</version>
    </dependency>
  </dependencies>

上一篇：分享如何轻松搭建单机solr服务，轻松成为大牛
下一篇： Lucene的索引详解（loc索引）

网站首页 > 开源技术正文

搜索引擎Lucene的秘密，在10W篇文章中的表现如何?

简介

测试思路

实验结果

测试程序

猜你喜欢

本文暂时没有评论，来添加一个吧(●'◡'●)

取消回复欢迎你发表评论:

网站首页 > 开源技术 正文

搜索引擎Lucene的秘密，在10W篇文章中的表现如何?

简介

测试思路

实验结果

测试程序

猜你喜欢

本文暂时没有评论，来添加一个吧(●'◡'●)

取消回复欢迎 你 发表评论:

网站首页 > 开源技术正文

取消回复欢迎你发表评论: