2016-07-26 75 views
0

当我尝试使用 search:estimate 获取搜索结果总数时,得到的结果是错误的。当我尝试从 search:search 的响应中解析总数时,结果同样不对,或者在翻页时每一页得到的总数都不一样。获取 MarkLogic 中确切搜索结果总数的最佳方法是什么?

我怎样才能得到我的搜索字符串对应的确切结果数量?

--- XXXX编辑的问题------

我的数据库由JSON文档组成,这些JSON文档具有层次结构。例如:下面是一个示例,我把它放在了帖子的结尾。抱歉粘贴了整个JSON结构,但我想你能明白我的意思。

我已为某些元素创建了字段/字段范围索引,例如

concept_species /species 
concept_name /name 
concept_registrar /registrar/name 
concept_scientist /scientist/name 
concept_supplier /suppliers/name 
concept_entitySubType /entitySubType 
concept_entityType /entityType 
concept_createdDate /createdDate 
concept_project /project/name 
concept_moniker /moniker 

当我的搜索带有上述“约束”之一时,xdmp:estimate 的结果是正确的。但是当我的搜索字符串不带任何这些约束时,xdmp:estimate 的结果就不准确了,尽管搜索结果本身是正确的。所有索引看起来都没有问题,为什么会出现这种情况?因此我退回到使用 fn:count 来获取搜索结果总数。

这可能与此问题无关,但为了完整起见,我还是补充一下。我创建了一个自定义约束,它基本上把约束转换为JSON中的路径。例如:假设用户想要搜索名称为“ATCC”的供应商。为了不让用户输入完整路径,我创建了一个自定义约束,用户按照JSON结构书写即可,我的约束会把它转换为实际的JSON路径。所以在这种情况下,搜索字符串是这样的:((concept:suppliers.name:(ATCC)))),我的自定义约束 concept 会把它转换为以下 cts:query:

<cts:json-property-scope-query xmlns:cts="http://marklogic.com/cts"> 
    <cts:property>suppliers</cts:property> 
    <cts:json-property-scope-query> 
    <cts:property>name</cts:property> 
    <cts:word-query> 
     <cts:text xml:lang="en">ATCC</cts:text> 
     <cts:option>case-insensitive</cts:option> 
     <cts:option>punctuation-insensitive</cts:option> 
     <cts:option>whitespace-insensitive</cts:option> 
     <cts:option>wildcarded</cts:option> 
    </cts:word-query> 
    </cts:json-property-scope-query> 
</cts:json-property-scope-query> 

这是我的JSON文件结构

{ 
    "moniker": "", 
    "entityType": "", 
    "entitySubType": "", 
    "abbvNumber": "", 
    "bioSafetyLevel": "", 
    "name": "", 
    "extCorpID": "", 
    "extLotID": "", 
    "selectAgent": "", 
    "comments": "", 
     "nucleotideSeq": { 
     "seq": "" 
     }, 
     "chains": [ 
     { 

      "chainType": "", 
      "name": "", 
      "plasmidLotID": "", 
      "stochiometry": 0, 
      "aminoAcids": [ 
      { 
       "sequence": "", 
       "predictedMatureSeqs": [ 
       { 
        "encodedChainName": "", 
        "encodedChainType": "", 
        "sequence": "", 
        "domains": [ 
        { 
         "allotype": "", 
         "domainType": "", 
         "entrezgeneID": "", 
         "geneSymbol": "", 
         "heavyChainIsoType": "", 
         "lightChainIsoType": "", 
         "name": "", 
         "regonizedAntigenFK": "", 
         "species": "", 
         "heavyChainIsoTypeMutation": "", 
         "antigens": [ 
         { 

          "antiIdiotypeType": "", 
          "antibodyAntigen": "", 
          "corporateID": "", 
          "description": "", 
          "entrezgeneID": "", 
          "geneSymbol": "", 
          "name": "", 
          "relatedProtein": "", 
          "sequence": "", 
          "species": "", 
          "type": "", 
          "externalID": "" 
         } 
         ] 
        } 
        ] 
       } 
       ], 
       "domains": [ 
       { 
        "allotype": "", 
        "domainType": "", 
        "entrezgeneID": "", 
        "geneSymbol": "", 
        "heavyChainIsoType": "", 
        "lightChainIsoType": "", 
        "name": "", 
        "regonizedAntigenFK": "", 
        "species": "", 
        "heavyChainIsoTypeMutation": "", 
        "antigens": [ 
        { 

         "antiIdiotypeType": "", 
         "antibodyAntigen": "", 
         "corporateID": "", 
         "description": "", 
         "entrezgeneID": "", 
         "geneSymbol": "", 
         "name": "", 
         "relatedProtein": "", 
         "sequence": "", 
         "species": "", 
         "type": "", 
         "externalID": "" 
        } 
        ] 
       } 
       ] 
      } 
      ], 
      "constructs": [ 
      { 
       "plasmidID": "", 
       "precursorAminoAcidSeq": "" 
      } 
      ] 
     } 
     ], 
     "supplier": { 
     "name": "", 
     "productID": "", 
     "atccCatalogNumber": "", 
     "lotID": "" 
     }, 
     "preparation": { 
     "type": "", 
     "lotIDs": [ 
      "" 
     ], 
     "amminoAcidDerivatization": "", 
     "chemicalConjugations": [ 
      { 
      "name": "", 
      "dar": "" 
      } 
     ], 
     "peptidateTreatment": "", 
     "proteinTreatment": "", 
     "purification": "", 
     "expressionSystem": "", 
     "empty": false 
     } 
    }, 
    "project": { 

     "name": "", 
     "status": "" 
    }, 
    "registrar": { 
     "username": "", 
     "email": "", 
     "name": "", 
     "upi": "", 
     "admin": false, 
     "curator": false, 
     "approvedUser": false 
    }, 
    "scientist": { 
     "username": "", 
     "email": "", 
     "name": "", 
     "upi": "", 
     "admin": false, 
     "curator": false, 
     "approvedUser": false 
    }, 
    "notebook": { 

     "elnPage": "", 
     "upi": "", 
     "location": "", 
     "subpage": "" 
    }, 
    "growthFS": { 

     "mediumUsed": "", 
     "otherComponents": "", 
     "percentCO2": 0, 
     "percentHumudity": 0, 
     "percentSerum": 0, 
     "selectionMarker": "", 
     "spinnerPlateSpeed": 0, 
     "temp": 0, 
     "drugResistance": "", 
     "growthConditions": "", 
     "passageNumber": "" 
    }, 
    "origin": { 

     "dateOfTransfection": "", 
     "hcAntibodyIsotype": "", 
     "lcAntibodyIsotype": "", 
     "parentCellLineLotID": "", 
     "parentChildRel": "", 
     "parentTissueSpecies": "", 
     "strain": "", 
     "tissueSource": "", 
     "celllineMemID": "", 
     "dateFrozen": "", 
     "strFingerprint": "", 
     "plasmidLotIDs": [ 
     "" 
     ] 
    }, 
    "miscellaneous": { 

     "expHostType": "", 
     "selEukaryote": "", 
     "selProkaryote": "", 
     "buffer": "", 
     "enotoxinLevel": "", 
     "enotoxinUnit": "", 
     "enotoxinMethod": "", 
     "concentrationLevel": "", 
     "concentrationUnit": "", 
     "concentrationMethod": "", 
     "mixture": "", 
     "proteinMw": 0 
    }, 
    "nucleotideSeq": { 
     "seq": "" 
    }, 
    "preparation": { 

     "type": "", 
     "lotIDs": [ 
     "" 
     ], 
     "amminoAcidDerivatization": "", 
     "chemicalConjugations": [ 
     { 
      "name": "", 
      "dar": "" 
     } 
     ], 
     "peptidateTreatment": "", 
     "proteinTreatment": "", 
     "purification": "", 
     "expressionSystem": "", 
     "empty": false 
    }, 
    "adc": { 

     "dars": [ 
     { 
      "value": 0, 
      "method": "", 
      "precision": "", 
      "empty": false 
     } 
     ], 
     "aggregations": [ 
     { 
      "percentAggMethod": "", 
      "percentAggValue": 0 
     } 
     ] 
    }, 
    "createdBy": "", 
    "createdDate": "", 
    "modifiedBy": "", 
    "modifiedDate": "", 
    "alternateName": "", 
    "chains": [ 
     { 

     "chainType": "", 
     "name": "", 
     "plasmidLotID": "", 
     "stochiometry": 0, 
     "aminoAcids": [ 
      { 
      "sequence": "", 
      "predictedMatureSeqs": [ 
       { 

       "avgMolWt": 0, 
       "encodedChainName": "", 
       "encodedChainType": "", 
       "length": 0, 
       "sequence": "", 
       "domains": [ 
        { 

        "allotype": "", 
        "domainType": "", 
        "domainEnd": 0, 
        "entrezgeneID": "", 
        "geneSymbol": "", 
        "heavyChainIsoType": "", 
        "lightChainIsoType": "", 
        "name": "", 
        "regonizedAntigenFK": "", 
        "species": "", 
        "domainStart": 0, 
        "heavyChainIsoTypeMutation": "", 
        "antigens": [ 
         { 

         "antiIdiotypeType": "", 
         "antibodyAntigen": "", 
         "corporateID": "", 
         "description": "", 
         "entrezgeneID": "", 
         "geneSymbol": "", 
         "name": "", 
         "relatedProtein": "", 
         "sequence": "", 
         "species": "", 
         "type": "", 
         "externalID": "" 
         } 
        ] 
        } 
       ] 
       } 
      ], 
      "domains": [ 
       { 

       "allotype": "", 
       "domainType": "", 
       "domainEnd": 0, 
       "entrezgeneID": "", 
       "geneSymbol": "", 
       "heavyChainIsoType": "", 
       "lightChainIsoType": "", 
       "name": "", 
       "regonizedAntigenFK": "", 
       "species": "", 
       "domainStart": 0, 
       "heavyChainIsoTypeMutation": "", 
       "antigens": [ 
        { 

        "antiIdiotypeType": "", 
        "antibodyAntigen": "", 
        "corporateID": "", 
        "description": "", 
        "entrezgeneID": "", 
        "geneSymbol": "", 
        "name": "", 
        "relatedProtein": "", 
        "sequence": "", 
        "species": "", 
        "type": "", 
        "externalID": "" 
        } 
       ] 
       } 
      ] 
      } 
     ], 
     "constructs": [ 
      { 
      "plasmidID": "", 
      "precursorAminoAcidSeq": "" 
      } 
     ] 
     } 
    ], 
    "orfs": [ 
     { 

     "orfEnd": 0, 
     "intronsPresent": "", 
     "orfStart": 0, 
     "promoters": [ 
      "" 
     ], 
     "aminoAcids": [ 
      { 
      "sequence": "", 
      "predictedMatureSeqs": [ 
       { 
       "encodedChainName": "", 
       "encodedChainType": "", 
       "length": 0, 
       "sequence": "", 
       "domains": [ 
        { 

        "allotype": "", 
        "domainType": "", 
        "domainEnd": 0, 
        "entrezgeneID": "", 
        "geneSymbol": "", 
        "heavyChainIsoType": "", 
        "lightChainIsoType": "", 
        "name": "", 
        "regonizedAntigenFK": "", 
        "species": "", 
        "domainStart": 0, 
        "heavyChainIsoTypeMutation": "", 
        "antigens": [ 
         { 

         "antiIdiotypeType": "", 
         "antibodyAntigen": "", 
         "corporateID": "", 
         "description": "", 
         "entrezgeneID": "", 
         "geneSymbol": "", 
         "name": "", 
         "relatedProtein": "", 
         "sequence": "", 
         "species": "", 
         "type": "", 
         "externalID": "" 
         } 
        ] 
        } 
       ] 
       } 
      ], 
      "domains": [ 
       { 
       "allotype": "", 
       "domainType": "", 
       "domainEnd": 0, 
       "entrezgeneID": "", 
       "geneSymbol": "", 
       "heavyChainIsoType": "", 
       "lightChainIsoType": "", 
       "name": "", 
       "regonizedAntigenFK": "", 
       "species": "", 
       "domainStart": 0, 
       "heavyChainIsoTypeMutation": "", 
       "antigens": [ 
        { 

        "antiIdiotypeType": "", 
        "antibodyAntigen": "", 
        "corporateID": "", 
        "description": "", 
        "entrezgeneID": "", 
        "geneSymbol": "", 
        "name": "", 
        "relatedProtein": "", 
        "sequence": "", 
        "species": "", 
        "type": "", 
        "externalID": "" 
        } 
       ] 
       } 
      ] 
      } 
     ], 
     "ncSeq": { 

      "seq": "" 
     }, 
     "label": "", 
     "note": "" 
     } 
    ], 
    "antigens": [ 
     { 

     "antiIdiotypeType": "", 
     "antibodyAntigen": "", 
     "corporateID": "", 
     "description": "", 
     "entrezgeneID": "", 
     "geneSymbol": "", 
     "name": "", 
     "relatedProtein": "", 
     "sequence": "", 
     "species": "", 
     "type": "", 
     "externalID": "" 
     } 
    ], 
    "immunogens": [ 
     { 

     "type": "", 
     "name": "", 
     "entrezgeneID": "", 
     "geneSymbol": "", 
     "corporateID": "", 
     "species": "", 
     "lotID": "", 
     "sequence": "" 
     } 
    ], 
    "suppliers": [ 
     { 

     "name": "", 
     "productID": "", 
     "atccCatalogNumber": "", 
     "lotID": "" 
     } 
    ], 
    "domains": [ 
     { 

     "allotype": "", 
     "domainType": "", 
     "domainEnd": 0, 
     "entrezgeneID": "", 
     "geneSymbol": "", 
     "heavyChainIsoType": "", 
     "lightChainIsoType": "", 
     "name": "", 
     "regonizedAntigenFK": "", 
     "species": "", 
     "domainStart": 0, 
     "heavyChainIsoTypeMutation": "", 
     "antigens": [ 
      { 

      "antiIdiotypeType": "", 
      "antibodyAntigen": "", 
      "corporateID": "", 
      "description": "", 
      "entrezgeneID": "", 
      "geneSymbol": "", 
      "name": "", 
      "relatedProtein": "", 
      "sequence": "", 
      "species": "", 
      "type": "", 
      "externalID": "" 
      } 
     ] 
     } 
} 
+1

您可能需要重构您的文档,以便它们与您的搜索查询和表达式是一对一的。但是,没有示例XML和查询,就不可能提出建议。 – wst

+0

我更新了我的问题,详细了解了我的文档结构以及我如何进行搜索 – Ravi

回答

0

这完全与片段(fragment)有关:您看到的数字是基于片段估算出来的。如果结果与您的预期不符,有几种选择(更改文档结构、设置片段根/片段父级、使用过滤搜索等)。不过,正如 wst 所说,请给出一个示例,这样大家才能给您更直接的指导。

+0

我更新了我的问题,提供了有关我的文档结构以及如何执行搜索的更多详细信息 – Ravi

0

我付出了性能上的代价,但通过使用 fn:count 得到了准确的总数。

我使用的是带自定义约束的 search:search,所以在我的情况下,只需要执行以下代码即可:

fn:count(cts:search(fn:doc(), cts:query(search:parse($q, $options)))) 
+0

自定义约束不是估计与fn:count之间的差异的原因。另外,您提出的解决方案不能很好地扩展。 –

+1

还要记住过滤搜索与未过滤搜索的区别。如果您的查询和所有索引都设置正确,从而可以运行未过滤搜索,那么查询会运行得更快,而且我相信您的 search:estimate 和 search:search 总数将是准确的。 –

+1

[使用 fn:count 与 xdmp:estimate](http://docs.marklogic.com/guide/search-dev/count_estimate) - 对于考虑这种方法的任何人来说都是必读内容。 –

1

Sam Mefford在他的评论中提供了更好的答案 - “还要记住过滤搜索与未过滤搜索的区别。如果您的查询和所有索引都设置正确,从而可以运行未过滤搜索,那么查询会运行得更快,而且我相信您的 search:estimate 和 search:search 总数将是准确的。”

fn:count() 从来都不是最优选择,只应用于对较小的序列、文档集、结果集等进行计数。过滤搜索也明显慢于未过滤搜索。如果您调整好索引,就可以使用未过滤搜索,并从 search:estimate、xdmp:estimate 和 search:search 分页中得到精确的计数。

0

如果您没有定义任何分片策略(fragmentation strategy),xdmp:estimate 应该能给出正确的结果,而且会比 fn:count 快得多。您可以把代码改写成这样:

xdmp:estimate(cts:search(fn:doc(), cts:query(search:parse($q, $options))))

相关问题