2017-04-10 111 views
1

我想用 Elasticsearch 为中文实现一个简单的演示,但搜索结果的相关性存在一些问题。(标题:关于 Elasticsearch 搜索结果相关性的问题)

我创建了一个新索引,其映射如下:

{ 
    "tag": { 
     "mappings": { 
      "tag": { 
       "properties": { 
        "name": { 
         "type": "text", 
         "analyzer": "standard" 
        }, 
        "note": { 
         "type": "text", 
         "analyzer": "standard" 
        }, 
        "status": { 
         "type": "integer" 
        }, 
        "synonyms": { 
         "type": "text", 
         "analyzer": "standard" 
        } 
       } 
      } 
     } 
    } 
} 

请求体中使用的查询词是“美国”:

{ 
    "query" : { 
     "bool" : { 
      "must" : { 
       "multi_match" : { 
        "query" : "美国", 
        "fields" : [ "name", "synonyms" ] 
       } 
      }, 
      "filter" : { 
       "term" : { 
        "status" : 2 
       } 
      } 
     } 
    } 
} 

有两条记录“中国”和“美国”匹配该查询,但“中国”这条记录的得分更高。响应 JSON 如下:

{ 
    "took": 2, 
    "timed_out": false, 
    "_shards": { 
     "total": 5, 
     "successful": 5, 
     "failed": 0 
    }, 
    "hits": { 
     "total": 2, 
     "max_score": 0.7373906, 
     "hits": [ { 
      "_index": "tag", 
      "_type": "tag", 
      "_id": "5482361185636870", 
      "_score": 0.7373906, 
      "_source": { 
       "status": 2, 
       "name": "中国", 
       "note": "", 
       "synonyms": [] 
      } 
     }, { 
      "_index": "tag", 
      "_type": "tag", 
      "_id": "5474649504748034", 
      "_score": 0.53484553, 
      "_source": { 
       "status": 2, 
       "name": "美国", 
       "note": "", 
       "synonyms": [] 
      } 
     } ] 
    } 
} 

“中国”这条记录的得分为 0.7373906,而“美国”这条记录只得到了 0.53484553。

explain 的结果如下:

{ 
    "hits": [ 
    { 
     "_shard": "[tag][0]", 
     "_node": "Wh9qH0bcTAaVNrsP1Aiyxg", 
     "_index": "tag", 
     "_type": "tag", 
     "_id": "5482361185636870", 
     "_score": 0.7373906, 
     "_source": { 
     "status": 2, 
     "name": "中国", 
     "note": "", 
     "synonyms": [] 
     }, 
     "_explanation": { 
     "value": 0.73739064, 
     "description": "sum of:", 
     "details": [ 
      { 
      "value": 0.73739064, 
      "description": "sum of:", 
      "details": [ 
       { 
       "value": 0.73739064, 
       "description": "max of:", 
       "details": [ 
        { 
        "value": 0.73739064, 
        "description": "sum of:", 
        "details": [ 
         { 
         "value": 0.73739064, 
         "description": "weight(name:国 in 0) [PerFieldSimilarity], result of:", 
         "details": [ 
          { 
          "value": 0.73739064, 
          "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", 
          "details": [ 
           { 
           "value": 0.6931472, 
           "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
           "details": [ 
            { 
            "value": 1, 
            "description": "docFreq", 
            "details": [] 
            }, 
            { 
            "value": 2, 
            "description": "docCount", 
            "details": [] 
            } 
           ] 
           }, 
           { 
           "value": 1.0638298, 
           "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
           "details": [ 
            { 
            "value": 1, 
            "description": "termFreq=1.0", 
            "details": [] 
            }, 
            { 
            "value": 1.2, 
            "description": "parameter k1", 
            "details": [] 
            }, 
            { 
            "value": 0.75, 
            "description": "parameter b", 
            "details": [] 
            }, 
            { 
            "value": 3, 
            "description": "avgFieldLength", 
            "details": [] 
            }, 
            { 
            "value": 2.56, 
            "description": "fieldLength", 
            "details": [] 
            } 
           ] 
           } 
          ] 
          } 
         ] 
         } 
        ] 
        } 
       ] 
       }, 
       { 
       "value": 0, 
       "description": "match on required clause, product of:", 
       "details": [ 
        { 
        "value": 0, 
        "description": "# clause", 
        "details": [] 
        }, 
        { 
        "value": 1, 
        "description": "status:[2 TO 2], product of:", 
        "details": [ 
         { 
         "value": 1, 
         "description": "boost", 
         "details": [] 
         }, 
         { 
         "value": 1, 
         "description": "queryNorm", 
         "details": [] 
         } 
        ] 
        } 
       ] 
       } 
      ] 
      }, 
      { 
      "value": 0, 
      "description": "match on required clause, product of:", 
      "details": [ 
       { 
       "value": 0, 
       "description": "# clause", 
       "details": [] 
       }, 
       { 
       "value": 1, 
       "description": "*:*, product of:", 
       "details": [ 
        { 
        "value": 1, 
        "description": "boost", 
        "details": [] 
        }, 
        { 
        "value": 1, 
        "description": "queryNorm", 
        "details": [] 
        } 
       ] 
       } 
      ] 
      } 
     ] 
     } 
    }, 
    { 
     "_shard": "[tag][4]", 
     "_node": "Wh9qH0bcTAaVNrsP1Aiyxg", 
     "_index": "tag", 
     "_type": "tag", 
     "_id": "5474649504748034", 
     "_score": 0.51623213, 
     "_source": { 
     "status": 2, 
     "name": "美国", 
     "note": "", 
     "synonyms": [] 
     }, 
     "_explanation": { 
     "value": 0.51623213, 
     "description": "sum of:", 
     "details": [ 
      { 
      "value": 0.51623213, 
      "description": "sum of:", 
      "details": [ 
       { 
       "value": 0.51623213, 
       "description": "max of:", 
       "details": [ 
        { 
        "value": 0.51623213, 
        "description": "sum of:", 
        "details": [ 
         { 
         "value": 0.25811607, 
         "description": "weight(name:美 in 0) [PerFieldSimilarity], result of:", 
         "details": [ 
          { 
          "value": 0.25811607, 
          "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", 
          "details": [ 
           { 
           "value": 0.2876821, 
           "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
           "details": [ 
            { 
            "value": 1, 
            "description": "docFreq", 
            "details": [] 
            }, 
            { 
            "value": 1, 
            "description": "docCount", 
            "details": [] 
            } 
           ] 
           }, 
           { 
           "value": 0.89722675, 
           "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
           "details": [ 
            { 
            "value": 1, 
            "description": "termFreq=1.0", 
            "details": [] 
            }, 
            { 
            "value": 1.2, 
            "description": "parameter k1", 
            "details": [] 
            }, 
            { 
            "value": 0.75, 
            "description": "parameter b", 
            "details": [] 
            }, 
            { 
            "value": 2, 
            "description": "avgFieldLength", 
            "details": [] 
            }, 
            { 
            "value": 2.56, 
            "description": "fieldLength", 
            "details": [] 
            } 
           ] 
           } 
          ] 
          } 
         ] 
         }, 
         { 
         "value": 0.25811607, 
         "description": "weight(name:国 in 0) [PerFieldSimilarity], result of:", 
         "details": [ 
          { 
          "value": 0.25811607, 
          "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", 
          "details": [ 
           { 
           "value": 0.2876821, 
           "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
           "details": [ 
            { 
            "value": 1, 
            "description": "docFreq", 
            "details": [] 
            }, 
            { 
            "value": 1, 
            "description": "docCount", 
            "details": [] 
            } 
           ] 
           }, 
           { 
           "value": 0.89722675, 
           "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
           "details": [ 
            { 
            "value": 1, 
            "description": "termFreq=1.0", 
            "details": [] 
            }, 
            { 
            "value": 1.2, 
            "description": "parameter k1", 
            "details": [] 
            }, 
            { 
            "value": 0.75, 
            "description": "parameter b", 
            "details": [] 
            }, 
            { 
            "value": 2, 
            "description": "avgFieldLength", 
            "details": [] 
            }, 
            { 
            "value": 2.56, 
            "description": "fieldLength", 
            "details": [] 
            } 
           ] 
           } 
          ] 
          } 
         ] 
         } 
        ] 
        } 
       ] 
       }, 
       { 
       "value": 0, 
       "description": "match on required clause, product of:", 
       "details": [ 
        { 
        "value": 0, 
        "description": "# clause", 
        "details": [] 
        }, 
        { 
        "value": 1, 
        "description": "status:[2 TO 2], product of:", 
        "details": [ 
         { 
         "value": 1, 
         "description": "boost", 
         "details": [] 
         }, 
         { 
         "value": 1, 
         "description": "queryNorm", 
         "details": [] 
         } 
        ] 
        } 
       ] 
       } 
      ] 
      }, 
      { 
      "value": 0, 
      "description": "match on required clause, product of:", 
      "details": [ 
       { 
       "value": 0, 
       "description": "# clause", 
       "details": [] 
       }, 
       { 
       "value": 1, 
       "description": "*:*, product of:", 
       "details": [ 
        { 
        "value": 1, 
        "description": "boost", 
        "details": [] 
        }, 
        { 
        "value": 1, 
        "description": "queryNorm", 
        "details": [] 
        } 
       ] 
       } 
      ] 
      } 
     ] 
     } 
    } 
    ] 
} 

回答

3

看来你的索引只包含少量文档,而且它们分布在不同的分片(shard)上。每个分片都有自己的词项频率统计,默认情况下 Elasticsearch 使用这些分片本地的值来计算得分。不过,你可以通过指定查询参数 search_type=dfs_query_then_fetch 来改变这一行为,或者像下面这样在请求体中添加相应的字段:

{ 
    "search_type": "dfs_query_then_fetch", 
    "query": { 
     "bool": { 
      "must": { 
       "multi_match": { 
        "query": "美国", 
        "fields": [ 
         "name", 
         "synonyms" 
        ] 
       } 
      }, 
      "filter": { 
       "term": { 
        "status": 2 
       } 
      } 
     } 
    } 
} 

更多信息请参见这篇文章:https://www.elastic.co/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch

+0

谢谢你的解答。是否有其他办法(例如更改设置、映射等)来解决这个问题?因为我发现 dfs_query_then_fetch 可能会导致性能下降。 – LCB

+0

如果您的索引很小,且不需要可扩展性,则可以创建只有单个分片的索引。如果索引较大,各分片之间的词项频率不会有显著差异,通常也就不会遇到这个问题。 – Random