ElasticSearch安装中文分词插件

ES 常用的中文分词插件有基于汉字的 ik 和基于拼音的 pinyin。下面先安装 ik 分词插件(插件版本必须与 ES 版本完全一致,安装完成后需要重启 ES 才能生效):

	cd /usr/local/src/elasticsearch-6.5.2
	./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.5.2/elasticsearch-analysis-ik-6.5.2.zip
关于中文分词:下面用同一句话对比三种分析器的分词效果(my_index 必须是已经存在的索引;也可以不指定索引,直接使用 POST /_analyze)
	POST   /my_index/_analyze
	{
		"analyzer":"ik_max_word",
		"text":"没有到医院开处方订购处方药是非法的"
	}
	POST   /my_index/_analyze
	{
		"analyzer":"ik_smart",
		"text":"没有到医院开处方订购处方药是非法的"
	}
	POST   /my_index/_analyze
	{
		"analyzer":"smartcn",
		"text":"没有到医院开处方订购处方药是非法的"
	}
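对比分词结果可以看出,最不准确的是 smartcn,这个中文分析器应尽量避免使用。(注意:smartcn 并非 ES 自带,需要先执行 ./bin/elasticsearch-plugin install analysis-smartcn 安装官方插件后才能使用。)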
ElasticSearch6.5 安装中文拼音分词插件
cd /usr/local/src/elasticsearch-6.5.2
./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v6.5.2/elasticsearch-analysis-pinyin-6.5.2.zip
创建索引,并配置两种分析器:默认分析器(使用 ik_max_word 分词)和自定义的拼音分析器 pinyin_analyzer
PUT /blog/
{
	"settings":{
		"number_of_shards": 3,
		"number_of_replicas": 1,
		"analysis": {
            "analyzer": {
				"default":{
					"tokenizer":"ik_max_word"
				},				
                "pinyin_analyzer": {
                    "type": "custom",
                    "tokenizer": "my_pinyin",
                    "filter": ["word_delimiter"]
                }
            },
            "tokenizer": {
                "my_pinyin" : {
                      "type" : "pinyin",
                      "keep_first_letter":true,
                      "keep_separate_first_letter" : false,
                      "keep_full_pinyin" : true,
                      "keep_original" : false,
                      "limit_first_letter_length" : 16,
                      "lowercase" : true
                  }
            }
        }
	}
}
针对索引配置字段映射
PUT /blog/_mapping/article
{    
	"properties" : {
		"title" : {
			"type" : "text",
			"analyzer" : "ik_max_word",
			"fields" : {
				"pinyin" : {
					"type" : "text",
					"term_vector" : "with_positions_offsets",
					"analyzer" : "pinyin_analyzer",
					"boost" : 10
				  }
			 }
		},      
		"content":{
		  "type":"text",
		  "analyzer" : "ik_max_word"
		},
		"author":{
		  "type":"text",
		  "analyzer" : "ik_max_word"
		},
		"keyword":{
		  "type":"text",
		  "analyzer" : "ik_max_word"
		},
		"createtime":{			
			"type":"date",			
			"format":"yyyy-MM-dd HH:mm:ss || yyyy-MM-dd || epoch_millis"	
		}
	}
}
检查自定义的拼音分析器 pinyin_analyzer 是否生效
GET /blog/_analyze
{
	"text":"刘德华",
	"analyzer":"pinyin_analyzer"
}
填充一些内容
POST /blog/article/1
{
    "title":"刘德华2018年演唱会将在北京举行",
    "content":"刘德华2018年跨年演唱会将在北京市朝阳区鸟巢体育馆举行",
    "author":"王军",
    "createtime":"2018-12-26"
}
按照拼音搜索
POST /blog/article/_search
{
    "query":{
      "match":{
        "title.pinyin":"liudehua"
      }
    }
}
按照中文名进行搜索
POST /blog/article/_search
{
    "query":{
      "match":{
        "title":"刘德华"
      }
    }
}
按照中文加拼音一起搜索
POST /blog/article/_search
{
  "query": {
    "multi_match": {
      "type":"most_fields",
      "query":"刘德h",
      "fields":["title", "title.pinyin"]
    }
  }
}
最终结果:
{
  "took" : 76,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : 14.556091,
    "hits" : [
      {
        "_index" : "blog",
        "_type" : "article",
        "_id" : "1",
        "_score" : 14.556091,
        "_source" : {
          "title" : "刘德华2018年演唱会将在北京举行",
          "content" : "刘德华2018年跨年演唱会将在北京市朝阳区鸟巢体育馆举行",
          "author" : "王军",
          "createtime" : "2018-12-26"
        }
      }
    ]
  }
}
通过 validate-query API(加上 explain 参数)可以在不真正执行查询的情况下,查看该查询被解析、改写后的底层形式,从而帮助你理解查询是如何执行的
GET /blog/article/_validate/query?explain
{
  "query": {
    "multi_match": {
      "type":"most_fields",
      "query":"刘德h",
      "fields":["title", "title.pinyin"]
    }
  }
}
执行结果:
{
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "failed" : 0
  },
  "valid" : true,
  "explanations" : [
    {
      "index" : "blog",
      "valid" : true,
      "explanation" : "+((title:刘 title:德 title:h) | ((title.pinyin:liu)^10.0 (title.pinyin:de)^10.0 Synonym(title.pinyin:h title.pinyin:ldh)))~1.0 #*:*"
    }
  ]
}
Logo

更多推荐