elasticsearch之实例篇
本文上接《elasticsearch之搭建篇》,看看如何实现类似糯米的检索功能。
在商铺和商品的存储方面,有嵌套(Nested)和父子文档(Parent-Child)两种方式,下面将依次探索。
嵌套(Nested)
创建type
商品将嵌套在商铺的文档中,其创建方法如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Index-management API, used to (re)create the merchant type.
$indices = $client->indices();

// Remove the previous "basic" index first, but only when it is actually there:
// deleting a missing index makes the client throw a 404 exception, which would
// abort the very first run of this script.
if ($indices->exists(['index' => 'basic'])) {
    $indices->delete(['index' => 'basic']);
}

// Create the "basic" index and declare the mapping of the merchant type in the
// same request.
//
// The field definitions use the native ES 5.x spelling (text / keyword /
// boolean "index"); the 2.x-style "string" + "analyzed" / "not_analyzed" / "no"
// spelling is deprecated in 5.x.
$indices->create([
    'index' => 'basic',
    'body'  => [
        // Index settings.
        'settings' => [
            'number_of_shards'   => 3, // 3 primary shards
            'number_of_replicas' => 2, // 2 replica copies of every primary shard
        ],
        // Type mappings.
        'mappings' => [
            // Merchant (shop).
            'merchant' => [
                'properties' => [
                    // Merchant name: full-text indexed with the IK Chinese analyzer.
                    'merchant_name' => [
                        'type'     => 'text',
                        'analyzer' => 'ik_max_word',
                    ],
                    // Merchant image URL: stored only, not indexed.
                    'merchant_img' => [
                        'type'  => 'keyword',
                        'index' => false,
                    ],
                    // Merchant category: indexed as a single exact term.
                    'merchant_type' => [
                        'type' => 'keyword',
                    ],
                    // User rating: indexed for filtering / sorting.
                    'merchant_score' => [
                        'type' => 'integer',
                    ],
                    // Average price per person: indexed for filtering / sorting.
                    'merchant_avg_price' => [
                        'type' => 'integer',
                    ],
                    // Geographic coordinates of the merchant.
                    'merchant_location' => [
                        'type' => 'geo_point',
                    ],
                    // Products sold by the merchant, stored as nested documents.
                    'merchant_product' => [
                        'type'       => 'nested',
                        'properties' => [
                            // Product ID.
                            'product_id' => [
                                'type' => 'long',
                            ],
                            // Product name: full-text indexed with the IK Chinese analyzer.
                            'product_name' => [
                                'type'     => 'text',
                                'analyzer' => 'ik_max_word',
                            ],
                            // Product image URL: stored only, not indexed.
                            'product_img' => [
                                'type'  => 'keyword',
                                'index' => false,
                            ],
                            // Product category: indexed as a single exact term.
                            'product_type' => [
                                'type' => 'keyword',
                            ],
                            // Product price: indexed for filtering / sorting.
                            'product_price' => [
                                'type' => 'integer',
                            ],
                            // Units sold: indexed for sorting / filtering.
                            'product_sold' => [
                                'type' => 'integer',
                            ],
                        ],
                    ],
                ],
            ],
        ],
    ],
]);
可见,商品列表作为一个属性存储在商铺中(type=nested,嵌套的),一个商铺有多个商品。(嵌套文档可以在这里学习)
通过curl查看刚刚建立的商铺type:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
[work@df6c675da97e nuomi-search]$ curl localhost:9200/basic?pretty { "basic" : { "aliases" : { }, "mappings" : { "merchant" : { "properties" : { "merchant_avg_price" : { "type" : "integer" }, "merchant_img" : { "type" : "keyword", "index" : false }, "merchant_location" : { "type" : "geo_point" }, "merchant_name" : { "type" : "text", "analyzer" : "ik_max_word" }, "merchant_product" : { "type" : "nested", "properties" : { "product_id" : { "type" : "long" }, "product_img" : { "type" : "keyword", "index" : false }, "product_name" : { "type" : "text", "analyzer" : "ik_max_word" }, "product_price" : { "type" : "integer" }, "product_sold" : { "type" : "integer" }, "product_type" : { "type" : "keyword" } } }, "merchant_score" : { "type" : "integer" }, "merchant_type" : { "type" : "keyword" } } } }, "settings" : { "index" : { "creation_date" : "1489890529746", "number_of_shards" : "3", "number_of_replicas" : "2", "uuid" : "sY5hH9kqQLq2mmZyW9HQmA", "version" : { "created" : "5020299" }, "provided_name" : "basic" } } } } |
上述信息与我创建时传入的一致,表明:商铺type已经成功建立,它利用嵌套文档技术,在其内部直接保存商品列表。
录入数据
我们通过bulk API批量的插入测试数据:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Bulk-insert the test data. The body alternates between an action line
// (['index' => ['_id' => N]]) and the document that belongs to it.
$response = $client->bulk([
    'index' => 'basic',
    'type'  => 'merchant',
    'body'  => [
        // "index" action; _id is the merchant ID (normally taken from MySQL).
        ['index' => ['_id' => 1]],
        // Main document.
        [
            'merchant_name'      => '鑫明明拉面',
            'merchant_score'     => 4,
            'merchant_type'      => '美食',
            'merchant_img'       => 'http://merchant.com/1.jpg',
            'merchant_avg_price' => 2100,
            // geo_point given as an array is [longitude, latitude].
            'merchant_location'  => [120.3945890000, 36.0705170000],
            // Nested product documents.
            'merchant_product'   => [
                [
                    'product_id'    => 1,
                    'product_name'  => '羊肉烩面',
                    'product_type'  => '面食',
                    'product_img'   => 'http://product.com/2.jpg',
                    'product_sold'  => 11,
                    'product_price' => 2200,
                ],
                [
                    'product_id'    => 2,
                    'product_name'  => '烤羊肉串',
                    'product_type'  => '烤串',
                    'product_img'   => 'http://product.com/3.jpg',
                    'product_sold'  => 12,
                    'product_price' => 2300,
                ],
            ],
        ],

        ['index' => ['_id' => 2]],
        [
            'merchant_name'      => '东方宫兰州拉面',
            'merchant_score'     => 3,
            'merchant_type'      => '美食',
            'merchant_img'       => 'http://merchant.com/2.jpg',
            'merchant_avg_price' => 1800,
            // [longitude, latitude]. The pair used to be written latitude-first,
            // which yields an invalid latitude of 120; these are the coordinates
            // the distance filter / sort examples report for this merchant.
            'merchant_location'  => [120.3835790000, 36.0718330000],
            'merchant_product'   => [
                [
                    'product_id'    => 3,
                    'product_name'  => '牛肉炒面',
                    'product_type'  => '面食',
                    'product_img'   => 'http://product.com/4.jpg',
                    'product_sold'  => 10,
                    'product_price' => 2400,
                ],
                [
                    'product_id'    => 4,
                    'product_name'  => '蛋炒饭',
                    'product_type'  => '主食',
                    'product_img'   => 'http://product.com/5.jpg',
                    'product_sold'  => 14,
                    'product_price' => 2300,
                ],
                [
                    'product_id'    => 5,
                    'product_name'  => '羊肉汤',
                    'product_type'  => '汤粉',
                    'product_img'   => 'http://product.com/6.jpg',
                    'product_sold'  => 10,
                    'product_price' => 2200,
                ],
            ],
        ],

        ['index' => ['_id' => 3]],
        [
            'merchant_name'      => '开海饭店',
            'merchant_score'     => 3,
            'merchant_type'      => '美食',
            'merchant_img'       => 'http://merchant.com/3.jpg',
            'merchant_avg_price' => 3500,
            'merchant_location'  => [120.4051170000, 36.0683000000],
            'merchant_product'   => [
                [
                    'product_id'    => 6,
                    'product_name'  => '海鲜炒饭',
                    'product_type'  => '主食',
                    'product_img'   => 'http://product.com/7.jpg',
                    'product_sold'  => 10,
                    'product_price' => 2400,
                ],
                [
                    'product_id'    => 7,
                    'product_name'  => '西红柿鸡蛋面',
                    'product_type'  => '面食',
                    'product_img'   => 'http://product.com/8.jpg',
                    'product_sold'  => 10,
                    'product_price' => 2300,
                ],
                [
                    'product_id'    => 8,
                    'product_name'  => '鸭血粉丝汤',
                    'product_type'  => '汤粉',
                    'product_img'   => 'http://product.com/9.jpg',
                    'product_sold'  => 10,
                    'product_price' => 2200,
                ],
                [
                    'product_id'    => 9,
                    'product_name'  => '兰州炒饭',
                    'product_type'  => '主食',
                    'product_img'   => 'http://product.com/10.jpg',
                    'product_sold'  => 15,
                    'product_price' => 2500,
                ],
            ],
        ],
    ],
]);

// A bulk request reports per-document failures inside the response body rather
// than failing as a whole, so check the "errors" flag and print what went wrong.
if (isset($response['errors']) && $response['errors'] === true) {
    foreach ($response['items'] as $item) {
        $outcome = isset($item['index']) ? $item['index'] : [];
        if (isset($outcome['error'])) {
            fwrite(STDERR, sprintf(
                "failed to index _id=%s: %s\n",
                isset($outcome['_id']) ? $outcome['_id'] : '?',
                json_encode($outcome['error'], JSON_UNESCAPED_UNICODE)
            ));
        }
    }
    exit(1);
}
- 商铺ID作为ES文档的唯一_id。
- 商品ID作为普通字段保存在嵌套文档的product_id字段。
嵌套查询
因为商铺和商品是嵌套关系,所以在查询时需要使用”嵌套查询”语法。
我的查找需求表达如下:
若某商铺的名称或者其售卖的”商品”的名称,能够匹配”搜索关键字”,那么返回该商铺的信息。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Search keyword.
$keyword = '东方宫拉面';

// Clause 1: full-text match against the merchant name.
$byMerchantName = [
    'match' => ['merchant_name' => $keyword],
];

// Clause 2: nested query against the product names of the merchant.
$byProductName = [
    'nested' => [
        // Path of the nested documents.
        'path'       => 'merchant_product',
        // "max": the best-matching product decides the score of this clause.
        'score_mode' => 'max',
        // Query run against each nested document; contributes to scoring.
        'query'      => [
            'match' => [
                // The field must be addressed by its full path.
                'merchant_product.product_name' => $keyword,
            ],
        ],
    ],
];

// Combine both clauses with bool/should (an OR relationship) and search the
// "merchant" type of the "basic" index.
$result = $client->search([
    'index' => 'basic',
    'type'  => 'merchant',
    'body'  => [
        'query' => [
            'bool' => [
                'should' => [$byMerchantName, $byProductName],
            ],
        ],
    ],
]);

print_r($result);
分析一下这个查询的组成部分(注意配合代码注释理解):
- query:查询语句必须放在其内部。
- bool:组合查询,可以表达多个子句之间的AND(must),OR(should),NOT(must_not)关系。
- should:OR的意思,里面多个子句满足任意一个即匹配,这个例子有2个子句。
- should的总相关性是这样计算的:所有匹配子句的相关性之和(旧版的TF/IDF打分还会再乘以“匹配的子句数/子句总数”,ES 5.x默认的BM25打分则只做加和)。
- match:全文匹配,会对$keyword分词,然后分别进行倒排查找。
- 该查询是should的第一个子句,会匹配得到一个相关性。
- nested:嵌套查询,是should的第二个子句。
- path:嵌套查询指向的子文档路径。
- score_mode:嵌套文档有多个,该参数指定nested子句的总相关性是如何计算的。
- 这里指定max,表示取多个商品中的最大相关性。
- query:对于嵌套查询来说,查询语句必须放在其内部。
- match:全文匹配,会对$keyword分词,然后分别进行倒排查找。
- 该查询是nested query的唯一子句,产生的相关性是nested的总相关性,也是should第二个子句的相关性。
其结果如下:
|
[work@df6c675da97e nuomi-search]$ php main.php Array ( [took] => 21 [timed_out] => [_shards] => Array ( [total] => 3 [successful] => 3 [failed] => 0 ) [hits] => Array ( [total] => 3 [max_score] => 2.5505729 [hits] => Array ( [0] => Array ( [_index] => basic [_type] => merchant [_id] => 1 [_score] => 2.5505729 [_source] => Array ( [merchant_name] => 鑫明明拉面 [merchant_score] => 4 [merchant_type] => 美食 [merchant_img] => http://merchant.com/1.jpg [merchant_avg_price] => 2100 [merchant_location] => Array ( [0] => 127 [1] => 128 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 1 [product_name] => 羊肉烩面 [product_type] => 面食 [product_img] => http://product.com/2.jpg [product_sold] => 11 [product_price] => 2200 ) [1] => Array ( [product_id] => 2 [product_name] => 烤羊肉串 [product_type] => 烤串 [product_img] => http://product.com/3.jpg [product_sold] => 12 [product_price] => 2300 ) ) ) ) [1] => Array ( [_index] => basic [_type] => merchant [_id] => 2 [_score] => 2.0315127 [_source] => Array ( [merchant_name] => 东方宫兰州拉面 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/2.jpg [merchant_avg_price] => 1800 [merchant_location] => Array ( [0] => 120 [1] => 120 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 3 [product_name] => 牛肉炒面 [product_type] => 面食 [product_img] => http://product.com/4.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 4 [product_name] => 蛋炒饭 [product_type] => 主食 [product_img] => http://product.com/5.jpg [product_sold] => 14 [product_price] => 2300 ) [2] => Array ( [product_id] => 5 [product_name] => 羊肉汤 [product_type] => 汤粉 [product_img] => http://product.com/6.jpg [product_sold] => 10 [product_price] => 2200 ) ) ) ) [2] => Array ( [_index] => basic [_type] => merchant [_id] => 3 [_score] => 1.0982643 [_source] => Array ( [merchant_name] => 开海饭店 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/3.jpg [merchant_avg_price] => 3500 
[merchant_location] => Array ( [0] => 50 [1] => 50 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 6 [product_name] => 海鲜炒饭 [product_type] => 主食 [product_img] => http://product.com/7.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 7 [product_name] => 西红柿鸡蛋面 [product_type] => 面食 [product_img] => http://product.com/8.jpg [product_sold] => 10 [product_price] => 2300 ) [2] => Array ( [product_id] => 8 [product_name] => 鸭血粉丝汤 [product_type] => 汤粉 [product_img] => http://product.com/9.jpg [product_sold] => 10 [product_price] => 2200 ) [3] => Array ( [product_id] => 9 [product_name] => 兰州炒饭 [product_type] => 主食 [product_img] => http://product.com/10.jpg [product_sold] => 15 [product_price] => 2500 ) ) ) ) ) ) ) |
查询结果出乎意料!
直观来看,”东方宫兰州拉面”应该更符合我的预期,为什么”鑫明明拉面”的相关性却高于”东方宫兰州拉面”呢?
首先,我们要了解相关性,它表示搜索词与文档的匹配程度。
搜索引擎一般采用『词频/逆向文档频率 (TF/IDF)』来计算相关性:
- TF:单词(TERM)在一个文档内的出现比例,出现越多说明TERM对这篇文章更重要,比如本文的”ES”就多次出现。
- IDF:逆向文档频率,衡量单词(TERM)在所有文档中的普遍程度。为了便于理解,下文直接用TERM在所有文档中出现的比例来表示它(严格来说,IDF是这个比例的倒数再取对数),出现越多说明TERM越大众越不起眼,比如”的,了,吧”这些助词。
- TF/IDF:TERM在文档内的TF越高,在所有文档中的IDF越小,说明TERM与该文档越相关。
通过肉眼分析,”东方宫兰州拉面”这家店无论从名字还是商品的名字都更贴近于我的搜索词”东方宫拉面”,是ES有BUG吗?
答案:并不是,出现这个现象的原因是因为不准确的IDF!
我的商铺表有3个分片Shard,通过查看获知”东方宫兰州拉面”独自在分片1中,而”鑫明明拉面”和”开海饭店”在分片2中。
ES在计算IDF的时候是基于分片内的数据统计的,分片1内的”拉面”只出现在”东方宫兰州拉面”内,相当于100%的IDF(在所有文档内出现);分片2内的”拉面”只出现在”鑫明明拉面”内,而”开海饭店”里并没有出现,相当于50%的IDF(在1/2的文档内出现),讲到这里我们就明白了:”东方宫兰州拉面”和”鑫明明拉面”中”拉面”都出现了1次,但是前者的IDF是1,而后者是1/2,经过TF/IDF计算显然是后者的值更大,也就是更相关了!
这个问题在数据规模较大的情况下可以忽略,在我们开发阶段可以通过指定一个参数解决:search_type=dfs_query_then_fetch,它将获取集群所有分片的IDF和之后再计算TF/IDF,因此更加准确。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Search keyword.
$keyword = '东方宫拉面';

// bool/should query: the clauses are OR-ed together and both affect the score.
$query = [
    'bool' => [
        'should' => [
            // Clause 1: full-text match against the merchant name.
            [
                'match' => ['merchant_name' => $keyword],
            ],
            // Clause 2: nested query against the product names.
            [
                'nested' => [
                    'path'       => 'merchant_product', // path of the nested documents
                    'score_mode' => 'max',              // best-matching product decides the clause score
                    'query'      => [
                        'match' => [
                            // The field must be addressed by its full path.
                            'merchant_product.product_name' => $keyword,
                        ],
                    ],
                ],
            ],
        ],
    ],
];

// Request parameters. search_type=dfs_query_then_fetch makes the scoring use
// term statistics gathered from all shards.
$params = [
    'index'       => 'basic',
    'type'        => 'merchant',
    'search_type' => 'dfs_query_then_fetch',
    'body'        => [
        'query' => $query,
    ],
];

$result = $client->search($params);

print_r($result);
这次结果正确:
|
[work@df6c675da97e nuomi-search]$ php main.php Array ( [took] => 30 [timed_out] => [_shards] => Array ( [total] => 3 [successful] => 3 [failed] => 0 ) [hits] => Array ( [total] => 3 [max_score] => 3.5293567 [hits] => Array ( [0] => Array ( [_index] => basic [_type] => merchant [_id] => 2 [_score] => 3.5293567 [_source] => Array ( [merchant_name] => 东方宫兰州拉面 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/2.jpg [merchant_avg_price] => 1800 [merchant_location] => Array ( [0] => 120 [1] => 120 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 3 [product_name] => 牛肉炒面 [product_type] => 面食 [product_img] => http://product.com/4.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 4 [product_name] => 蛋炒饭 [product_type] => 主食 [product_img] => http://product.com/5.jpg [product_sold] => 14 [product_price] => 2300 ) [2] => Array ( [product_id] => 5 [product_name] => 羊肉汤 [product_type] => 汤粉 [product_img] => http://product.com/6.jpg [product_sold] => 10 [product_price] => 2200 ) ) ) ) [1] => Array ( [_index] => basic [_type] => merchant [_id] => 1 [_score] => 2.155528 [_source] => Array ( [merchant_name] => 鑫明明拉面 [merchant_score] => 4 [merchant_type] => 美食 [merchant_img] => http://merchant.com/1.jpg [merchant_avg_price] => 2100 [merchant_location] => Array ( [0] => 127 [1] => 128 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 1 [product_name] => 羊肉烩面 [product_type] => 面食 [product_img] => http://product.com/2.jpg [product_sold] => 11 [product_price] => 2200 ) [1] => Array ( [product_id] => 2 [product_name] => 烤羊肉串 [product_type] => 烤串 [product_img] => http://product.com/3.jpg [product_sold] => 12 [product_price] => 2300 ) ) ) ) [2] => Array ( [_index] => basic [_type] => merchant [_id] => 3 [_score] => 1.1084312 [_source] => Array ( [merchant_name] => 开海饭店 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/3.jpg [merchant_avg_price] => 3500 
[merchant_location] => Array ( [0] => 50 [1] => 50 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 6 [product_name] => 海鲜炒饭 [product_type] => 主食 [product_img] => http://product.com/7.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 7 [product_name] => 西红柿鸡蛋面 [product_type] => 面食 [product_img] => http://product.com/8.jpg [product_sold] => 10 [product_price] => 2300 ) [2] => Array ( [product_id] => 8 [product_name] => 鸭血粉丝汤 [product_type] => 汤粉 [product_img] => http://product.com/9.jpg [product_sold] => 10 [product_price] => 2200 ) [3] => Array ( [product_id] => 9 [product_name] => 兰州炒饭 [product_type] => 主食 [product_img] => http://product.com/10.jpg [product_sold] => 15 [product_price] => 2500 ) ) ) ) ) ) ) |
最佳子句
接下来我替换搜索关键字为:”兰炒饭”,其实我本意是”兰州炒饭”,只不过我输错了(如果搜索仍旧可以给我理想的结果,我会爱上它)。
我们直接看查询结果:
|
[work@df6c675da97e nuomi-search]$ php main.php Array ( [took] => 17 [timed_out] => [_shards] => Array ( [total] => 3 [successful] => 3 [failed] => 0 ) [hits] => Array ( [total] => 3 [max_score] => 2.666227 [hits] => Array ( [0] => Array ( [_index] => basic [_type] => merchant [_id] => 2 [_score] => 2.666227 [_source] => Array ( [merchant_name] => 东方宫兰州拉面 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/2.jpg [merchant_avg_price] => 1800 [merchant_location] => Array ( [0] => 120 [1] => 120 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 3 [product_name] => 牛肉炒面 [product_type] => 面食 [product_img] => http://product.com/4.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 4 [product_name] => 蛋炒饭 [product_type] => 主食 [product_img] => http://product.com/5.jpg [product_sold] => 14 [product_price] => 2300 ) [2] => Array ( [product_id] => 5 [product_name] => 羊肉汤 [product_type] => 汤粉 [product_img] => http://product.com/6.jpg [product_sold] => 10 [product_price] => 2200 ) ) ) ) [1] => Array ( [_index] => basic [_type] => merchant [_id] => 1 [_score] => 2.155528 [_source] => Array ( [merchant_name] => 鑫明明拉面 [merchant_score] => 4 [merchant_type] => 美食 [merchant_img] => http://merchant.com/1.jpg [merchant_avg_price] => 2100 [merchant_location] => Array ( [0] => 127 [1] => 128 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 1 [product_name] => 羊肉烩面 [product_type] => 面食 [product_img] => http://product.com/2.jpg [product_sold] => 11 [product_price] => 2200 ) [1] => Array ( [product_id] => 2 [product_name] => 烤羊肉串 [product_type] => 烤串 [product_img] => http://product.com/3.jpg [product_sold] => 12 [product_price] => 2300 ) ) ) ) [2] => Array ( [_index] => basic [_type] => merchant [_id] => 3 [_score] => 1.76352 [_source] => Array ( [merchant_name] => 开海饭店 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/3.jpg [merchant_avg_price] => 3500 [merchant_location] 
=> Array ( [0] => 50 [1] => 50 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 6 [product_name] => 海鲜炒饭 [product_type] => 主食 [product_img] => http://product.com/7.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 7 [product_name] => 西红柿鸡蛋面 [product_type] => 面食 [product_img] => http://product.com/8.jpg [product_sold] => 10 [product_price] => 2300 ) [2] => Array ( [product_id] => 8 [product_name] => 鸭血粉丝汤 [product_type] => 汤粉 [product_img] => http://product.com/9.jpg [product_sold] => 10 [product_price] => 2200 ) [3] => Array ( [product_id] => 9 [product_name] => 兰州炒饭 [product_type] => 主食 [product_img] => http://product.com/10.jpg [product_sold] => 15 [product_price] => 2500 ) ) ) ) ) ) ) |
从直觉上来看,”开海饭店”应排在第一位,因为它正在售卖我的最爱:”兰州炒饭”,可为什么”东方宫兰州拉面”在第一位呢?
bool的should会把其内部2个子句(一个匹配商铺名称,一个匹配商品名称)的相关性相加,其结果作为商铺文档的总相关性(旧版的TF/IDF打分还会再乘以“匹配的子句数/子句总数”,ES 5.x默认的BM25打分则只做加和)。
- “开海饭店”是商铺标题,和”兰炒饭”没有一丁点相关性,因此第一个子句的相关性=0。
- “兰州炒饭”完美匹配我的查询,因此第二个子句有很高的相关性。
- 总相关性 = 0 + 一个很高的相关性,只有一个子句贡献了分数。
- “东方宫兰州拉面”是商铺标题,出现了”兰”,因此第一个子句的相关性还不错。
- “蛋炒饭”出现了”炒饭”,因此第二个子句的相关性还不错。
- 总相关性 = 一个不错的相关性 + 一个不错的相关性,两个子句的分数叠加之后反而超过了前者。
上面的查询结果就是这样的情况下产生的,完全不符合预期!
我的初衷是找到最符合搜索关键字的字段,无论它是”商铺名称”还是”商品名称”。
“最佳字段“就是解决这个问题的:它保留多个检索字段中最大的相关性作为总相关性,查询变化如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Search keyword: the mistyped form of "兰州炒饭" that this example is about.
$keyword = '兰炒饭';

// dis_max ("best field") query: the document score is the highest score among
// the listed queries instead of a combination of all of them.
$result = $client->search([
    'index'       => 'basic',                // index
    'type'        => 'merchant',             // type
    'search_type' => 'dfs_query_then_fetch', // score with term statistics gathered from all shards
    'body'        => [
        'query' => [
            'dis_max' => [
                'queries' => [
                    // Query 1: full-text match against the merchant name.
                    [
                        'match' => ['merchant_name' => $keyword],
                    ],
                    // Query 2: nested query against the product names.
                    [
                        'nested' => [
                            'path'       => 'merchant_product', // path of the nested documents
                            'score_mode' => 'max',              // best-matching product decides the clause score
                            'query'      => [
                                'match' => [
                                    // The field must be addressed by its full path.
                                    'merchant_product.product_name' => $keyword,
                                ],
                            ],
                        ],
                    ],
                ],
            ],
        ],
    ],
]);

print_r($result);
主要做了如下调整:
- bool组合查询替换成了dis_max最佳字段查询。
- should替换成了queries,下面同样包含多个查询子句。
现在结果正确!
|
[work@df6c675da97e nuomi-search]$ php main.php Array ( [took] => 41 [timed_out] => [_shards] => Array ( [total] => 3 [successful] => 3 [failed] => 0 ) [hits] => Array ( [total] => 3 [max_score] => 1.76352 [hits] => Array ( [0] => Array ( [_index] => basic [_type] => merchant [_id] => 3 [_score] => 1.76352 [_source] => Array ( [merchant_name] => 开海饭店 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/3.jpg [merchant_avg_price] => 3500 [merchant_location] => Array ( [0] => 50 [1] => 50 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 6 [product_name] => 海鲜炒饭 [product_type] => 主食 [product_img] => http://product.com/7.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 7 [product_name] => 西红柿鸡蛋面 [product_type] => 面食 [product_img] => http://product.com/8.jpg [product_sold] => 10 [product_price] => 2300 ) [2] => Array ( [product_id] => 8 [product_name] => 鸭血粉丝汤 [product_type] => 汤粉 [product_img] => http://product.com/9.jpg [product_sold] => 10 [product_price] => 2200 ) [3] => Array ( [product_id] => 9 [product_name] => 兰州炒饭 [product_type] => 主食 [product_img] => http://product.com/10.jpg [product_sold] => 15 [product_price] => 2500 ) ) ) ) [1] => Array ( [_index] => basic [_type] => merchant [_id] => 2 [_score] => 1.6903362 [_source] => Array ( [merchant_name] => 东方宫兰州拉面 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/2.jpg [merchant_avg_price] => 1800 [merchant_location] => Array ( [0] => 120 [1] => 120 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 3 [product_name] => 牛肉炒面 [product_type] => 面食 [product_img] => http://product.com/4.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 4 [product_name] => 蛋炒饭 [product_type] => 主食 [product_img] => http://product.com/5.jpg [product_sold] => 14 [product_price] => 2300 ) [2] => Array ( [product_id] => 5 [product_name] => 羊肉汤 [product_type] => 汤粉 [product_img] => 
http://product.com/6.jpg [product_sold] => 10 [product_price] => 2200 ) ) ) ) [2] => Array ( [_index] => basic [_type] => merchant [_id] => 1 [_score] => 1.1084312 [_source] => Array ( [merchant_name] => 鑫明明拉面 [merchant_score] => 4 [merchant_type] => 美食 [merchant_img] => http://merchant.com/1.jpg [merchant_avg_price] => 2100 [merchant_location] => Array ( [0] => 127 [1] => 128 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 1 [product_name] => 羊肉烩面 [product_type] => 面食 [product_img] => http://product.com/2.jpg [product_sold] => 11 [product_price] => 2200 ) [1] => Array ( [product_id] => 2 [product_name] => 烤羊肉串 [product_type] => 烤串 [product_img] => http://product.com/3.jpg [product_sold] => 12 [product_price] => 2300 ) ) ) ) ) ) ) |
过滤距离
通常,我们希望找到附近N公里内的商铺,因此必须利用坐标进行筛选,ES提供了索引地理位置的能力。
假设我的坐标是(X,Y),搜索范围是以它为圆心,半径为1公里的圆形,那么ES会怎么做呢?
ES首先为每个商铺的merchant_location建立了索引,(X,Y)坐标将被建立2个索引:
- 按经度索引。
- 按纬度索引。
ES在执行查询时,首先以(X,Y)为中心画一个矩形,它恰好能够包裹圆形,这样的目的是可以利用2个索引快速缩小范围:
- 矩形的x轴区间范围,可以使用经度索引筛选出一批X轴在1公里范围内的文档。
- 矩形的y轴区间范围,可以使用纬度索引筛选出一批Y轴在1公里范围内的文档。
- 两个文档集合求交集,可以得到矩形范围内的所有文档。
- 矩形比圆形要多一些区域,因此遍历所有文档计算它们和(X,Y)之间的距离,将圆形外的点删除。
这种工作方式叫做”地理坐标盒模型“,它是一种精度高,计算耗费资源比较多的一种手段(另外一种精度低,资源耗费少的方式是geohash)。
下面的请求,首先利用”距离”过滤出1KM内的商铺,之后再基于过滤的结果进行全文检索并计算相关性:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Search keyword.
$keyword = '拉面';

// Scoring part: dis_max keeps the best score among its queries.
$fullText = [
    'dis_max' => [
        'queries' => [
            // Query 1: full-text match against the merchant name.
            [
                'match' => ['merchant_name' => $keyword],
            ],
            // Query 2: nested query against the product names.
            [
                'nested' => [
                    'path'       => 'merchant_product', // path of the nested documents
                    'score_mode' => 'max',              // best-matching product decides the clause score
                    'query'      => [
                        'match' => [
                            // The field must be addressed by its full path.
                            'merchant_product.product_name' => $keyword,
                        ],
                    ],
                ],
            ],
        ],
    ],
];

// Filtering part: keep only merchants within 1km of the given point.
$withinDistance = [
    'geo_distance' => [
        'distance'          => '1km',
        'merchant_location' => [
            120.3887320000,
            36.0683290000,
        ],
    ],
];

// The full-text query goes into bool/must, the geo filter into bool/filter.
$result = $client->search([
    'index'       => 'basic',
    'type'        => 'merchant',
    'search_type' => 'dfs_query_then_fetch', // score with term statistics gathered from all shards
    'body'        => [
        'query' => [
            'bool' => [
                'must'   => $fullText,
                'filter' => $withinDistance,
            ],
        ],
    ],
]);

print_r($result);
这里使用了过滤,它在全文检索之前对数据进行按条件筛选,过滤的结果可以被ES缓存。
在ES5.x版本中,过滤语法filter必须和全文检索放在一个bool组合中,而全文检索放在must中即可。
下面是结果,它们按相关性排序:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
[work@df6c675da97e nuomi-search]$ php main.php Array ( [took] => 89 [timed_out] => [_shards] => Array ( [total] => 3 [successful] => 3 [failed] => 0 ) [hits] => Array ( [total] => 2 [max_score] => 1.1084312 [hits] => Array ( [0] => Array ( [_index] => basic [_type] => merchant [_id] => 1 [_score] => 1.1084312 [_source] => Array ( [merchant_name] => 鑫明明拉面 [merchant_score] => 4 [merchant_type] => 美食 [merchant_img] => http://merchant.com/1.jpg [merchant_avg_price] => 2100 [merchant_location] => Array ( [0] => 120.394589 [1] => 36.070517 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 1 [product_name] => 羊肉烩面 [product_type] => 面食 [product_img] => http://product.com/2.jpg [product_sold] => 11 [product_price] => 2200 ) [1] => Array ( [product_id] => 2 [product_name] => 烤羊肉串 [product_type] => 烤串 [product_img] => http://product.com/3.jpg [product_sold] => 12 [product_price] => 2300 ) ) ) ) [1] => Array ( [_index] => basic [_type] => merchant [_id] => 2 [_score] => 0.97589093 [_source] => Array ( [merchant_name] => 东方宫兰州拉面 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/2.jpg [merchant_avg_price] => 1800 [merchant_location] => Array ( [0] => 120.383579 [1] => 36.071833 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 3 [product_name] => 牛肉炒面 [product_type] => 面食 [product_img] => http://product.com/4.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 4 [product_name] => 蛋炒饭 [product_type] => 主食 [product_img] => http://product.com/5.jpg [product_sold] => 14 [product_price] => 2300 ) [2] => Array ( [product_id] => 5 [product_name] => 羊肉汤 [product_type] => 汤粉 [product_img] => http://product.com/6.jpg [product_sold] => 10 [product_price] => 2200 ) ) ) ) ) ) ) |
现在扩大距离范围distance为2km,可以看到三个”商铺”全部返回(我就不贴结果了,亲自动手试试吧)。
排序
在糯米搜索中,”综合排序”其实就是指相关性排序,是ES的默认排序方法。
但是仔细观察糯米检索会发现,它支持若干其他排序方式,比如:按距离排序。
新的查询需求如下:搜索2KM之内,与”拉面”相关的”店铺”,并且按照距离远近排序。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
<?php

require_once __DIR__ . '/vendor/autoload.php';

// Client. Several nodes are listed as hosts; for a real cluster it is better to
// put an HAProxy reverse proxy in front of Elasticsearch.
$client = Elasticsearch\ClientBuilder::fromConfig([
    'hosts'   => ['localhost:9200', 'localhost:9201', 'localhost:9203'],
    'retries' => 2,
]);

// Search keyword.
$keyword = '拉面';

// Reference point, used both by the distance filter and by the distance sort.
$origin = [
    120.3887320000,
    36.0683290000,
];

// Scoring part: dis_max keeps the best score among its queries.
$fullText = [
    'dis_max' => [
        'queries' => [
            // Query 1: full-text match against the merchant name.
            [
                'match' => ['merchant_name' => $keyword],
            ],
            // Query 2: nested query against the product names.
            [
                'nested' => [
                    'path'       => 'merchant_product', // path of the nested documents
                    'score_mode' => 'max',              // best-matching product decides the clause score
                    'query'      => [
                        'match' => [
                            // The field must be addressed by its full path.
                            'merchant_product.product_name' => $keyword,
                        ],
                    ],
                ],
            ],
        ],
    ],
];

// Filtering part: keep only merchants within 2km of the reference point.
$withinDistance = [
    'geo_distance' => [
        'distance'          => '2km',
        'merchant_location' => $origin,
    ],
];

// Sorting part: order the hits by their distance to the reference point.
$byDistance = [
    '_geo_distance' => [
        'merchant_location' => $origin, // distance is computed against this point
        'order'             => 'asc',   // nearest merchants first
        'unit'              => 'km',    // sort values are returned in kilometres
    ],
];

$result = $client->search([
    'index'       => 'basic',
    'type'        => 'merchant',
    'search_type' => 'dfs_query_then_fetch', // score with term statistics gathered from all shards
    'body'        => [
        'query' => [
            'bool' => [
                'must'   => $fullText,
                'filter' => $withinDistance,
            ],
        ],
        'sort'  => [$byDistance],
    ],
]);

print_r($result);
- sort是必须要写的。
- sort内部可以并列多个排序条件。
- _geo_distance是坐标排序的系统关键字。
结果如下:
|
[work@df6c675da97e nuomi-search]$ php main.php Array ( [took] => 68 [timed_out] => [_shards] => Array ( [total] => 3 [successful] => 3 [failed] => 0 ) [hits] => Array ( [total] => 3 [max_score] => [hits] => Array ( [0] => Array ( [_index] => basic [_type] => merchant [_id] => 1 [_score] => [_source] => Array ( [merchant_name] => 鑫明明拉面 [merchant_score] => 4 [merchant_type] => 美食 [merchant_img] => http://merchant.com/1.jpg [merchant_avg_price] => 2100 [merchant_location] => Array ( [0] => 120.394589 [1] => 36.070517 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 1 [product_name] => 羊肉烩面 [product_type] => 面食 [product_img] => http://product.com/2.jpg [product_sold] => 11 [product_price] => 2200 ) [1] => Array ( [product_id] => 2 [product_name] => 烤羊肉串 [product_type] => 烤串 [product_img] => http://product.com/3.jpg [product_sold] => 12 [product_price] => 2300 ) ) ) [sort] => Array ( [0] => 0.57992238133363 ) ) [1] => Array ( [_index] => basic [_type] => merchant [_id] => 2 [_score] => [_source] => Array ( [merchant_name] => 东方宫兰州拉面 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] => http://merchant.com/2.jpg [merchant_avg_price] => 1800 [merchant_location] => Array ( [0] => 120.383579 [1] => 36.071833 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 3 [product_name] => 牛肉炒面 [product_type] => 面食 [product_img] => http://product.com/4.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 4 [product_name] => 蛋炒饭 [product_type] => 主食 [product_img] => http://product.com/5.jpg [product_sold] => 14 [product_price] => 2300 ) [2] => Array ( [product_id] => 5 [product_name] => 羊肉汤 [product_type] => 汤粉 [product_img] => http://product.com/6.jpg [product_sold] => 10 [product_price] => 2200 ) ) ) [sort] => Array ( [0] => 0.60523716061392 ) ) [2] => Array ( [_index] => basic [_type] => merchant [_id] => 3 [_score] => [_source] => Array ( [merchant_name] => 开海饭店 [merchant_score] => 3 [merchant_type] => 美食 [merchant_img] 
=> http://merchant.com/3.jpg [merchant_avg_price] => 3500 [merchant_location] => Array ( [0] => 120.405117 [1] => 36.0683 ) [merchant_product] => Array ( [0] => Array ( [product_id] => 6 [product_name] => 海鲜炒饭 [product_type] => 主食 [product_img] => http://product.com/7.jpg [product_sold] => 10 [product_price] => 2400 ) [1] => Array ( [product_id] => 7 [product_name] => 西红柿鸡蛋面 [product_type] => 面食 [product_img] => http://product.com/8.jpg [product_sold] => 10 [product_price] => 2300 ) [2] => Array ( [product_id] => 8 & |