2017-08-04 257 views
2

我正在尝试使用PyMongo编写一个Python脚本,用它查询MongoDB,对数据库中可能存在的n个对象进行精确匹配。目前,我的设置是这样的(问题标题:在多个文档字段上的MongoDB精确匹配):

db.entries.find({'$or': [<list-of-objects>]}) 

其中,对象列表看起来是这样的:

[{'email': '[email protected]', 'zip': '11111'}, {'email': '[email protected]', 'zip': '11112'}, ...] 

当列表中只有10个左右的条目时,使用$or效果很好。我现在正在用100个条目进行测试,结果要花很长时间才能返回。我曾考虑过使用多个$in过滤器,但我不知道这是否是最佳选择。

我确定有更好的方法来处理这个问题,但我对Mongo相当陌生。

编辑:.explain()的输出如下:

{ 
    "executionStats": { 
     "executionTimeMillis": 228734, 
     "nReturned": 2, 
     "totalKeysExamined": 0, 
     "allPlansExecution": [], 
     "executionSuccess": true, 
     "executionStages": { 
      "needYield": 0, 
      "saveState": 43556, 
      "restoreState": 43556, 
      "isEOF": 1, 
      "inputStage": { 
       "needYield": 0, 
       "saveState": 43556, 
       "restoreState": 43556, 
       "isEOF": 1, 
       "inputStage": { 
        "needYield": 0, 
        "direction": "forward", 
        "saveState": 43556, 
        "restoreState": 43556, 
        "isEOF": 1, 
        "docsExamined": 5453000, 
        "nReturned": 2, 
        "needTime": 5452999, 
        "filter": { 
         "$or": [{ 
          "$and": [{ 
           "email": { 
            "$eq": "[email protected]" 
           } 
          }, { 
           "zipcode": { 
            "$eq": "11111" 
           } 
          }] 
         }, { 
          "$and": [{ 
           "email": { 
            "$eq": "[email protected]" 
           } 
          }, { 
           "zipcode": { 
            "$eq": "11112" 
           } 
          }] 
         }] 
        }, 
        "executionTimeMillisEstimate": 208083, 
        "invalidates": 0, 
        "works": 5453002, 
        "advanced": 2, 
        "stage": "COLLSCAN" 
       }, 
       "nReturned": 2, 
       "needTime": 5452999, 
       "executionTimeMillisEstimate": 211503, 
       "transformBy": { 
        "_id": false 
       }, 
       "invalidates": 0, 
       "works": 5453002, 
       "advanced": 2, 
       "stage": "PROJECTION" 
      }, 
      "nReturned": 2, 
      "needTime": 5452999, 
      "executionTimeMillisEstimate": 213671, 
      "invalidates": 0, 
      "works": 5453002, 
      "advanced": 2, 
      "stage": "SUBPLAN" 
     }, 
     "totalDocsExamined": 5453000 
    }, 
    "queryPlanner": { 
     "parsedQuery": { 
      "$or": [{ 
       "$and": [{ 
        "email": { 
         "$eq": "[email protected]" 
        } 
       }, { 
        "zipcode": { 
         "$eq": "11111" 
        } 
       }] 
      }, { 
       "$and": [{ 
        "email": { 
         "$eq": "[email protected]" 
        } 
       }, { 
        "zipcode": { 
         "$eq": "11112" 
        } 
       }] 
      }] 
     }, 
     "rejectedPlans": [], 
     "namespace": "db.entries", 
     "winningPlan": { 
      "inputStage": { 
       "transformBy": { 
        "_id": false 
       }, 
       "inputStage": { 
        "filter": { 
         "$or": [{ 
          "$and": [{ 
           "email": { 
            "$eq": "[email protected]" 
           } 
          }, { 
           "zipcode": { 
            "$eq": "11111" 
           } 
          }] 
         }, { 
          "$and": [{ 
           "email": { 
            "$eq": "[email protected]" 
           } 
          }, { 
           "zipcode": { 
            "$eq": "11112" 
           } 
          }] 
         }] 
        }, 
        "direction": "forward", 
        "stage": "COLLSCAN" 
       }, 
       "stage": "PROJECTION" 
      }, 
      "stage": "SUBPLAN" 
     }, 
     "indexFilterSet": false, 
     "plannerVersion": 1 
    }, 
    "ok": 1.0, 
    "serverInfo": { 
     "host": "somehost", 
     "version": "3.4.6", 
     "port": 27017, 
     "gitVersion": "c55eb86ef46ee7aede3b1e2a5d184a7df4bfb5b5" 
    } 
} 
+0

请添加'.explain()'的输出 –

+0

@MarkusWMahlberg 请看原帖(OP),已经添加了 – xtheking

+0

这个查询效率有点低:你检查了5453000个文档,最终只得到2个文档。为什么不这样做:1. 在任何高基数的字段上创建索引,可以是邮政编码或电子邮件。 2. 使用聚合管道,先用创建了索引的字段筛选文档,这样就能借助新索引过滤掉大量文档。希望有所帮助。 – Euclides

回答

0

为了避免建立索引和重建索引(这个查询不只涉及email/zip,所用字段是动态的),我为每个字段名(标题)构建一个值列表,把这些列表用作$in参数,然后将这些条件传递给$and。它似乎工作得很好,查询时间没有超过3分钟。

例子:

{'$and': [{'email': {'$in': ['[email protected]', '[email protected]', '[email protected]']}, 'zipcode': {'$in': ['12345', '11111', '11112']}}]} 
1

我建议针对你的情况创建一个新的索引(复合索引),因为你是按两个字段进行搜索的:

db.entries.createIndex({"email": 1, "zip": 1}) 

现在在查询后附加explain()命令再运行一次,你应该会看到它不再使用COLLSCAN,而是开始使用IXSCAN。(注意:索引中的字段名必须与查询实际使用的字段名一致;问题中explain()输出显示的字段是zipcode。)