2014-09-29 82 views
-1

我只想复制一个集合,同时去掉其中的冗余条目。一种做法是导出整个集合,并把其中一个字段改成 _id。另外,我也可以复制该集合,再对该字段建立索引来消除冗余,但这些做法都太繁琐了。(标题:创建一个去除冗余的新集合)

但是,有没有更优雅的解决方案?也许我可以做这样的事情。

// Copies one document per distinct 'Query Sequence' value into newcollection.
// NOTE(review): distinct() returns all values as a single array; the author
// reports that this attempt fails because that array is too large.
db.coll.distinct('Query Sequence').forEach(
     function(x){ 
      db.newcollection.insert(db.coll.findOne({'Query Sequence':x}))}) 

当然,这样行不通……有没有人知道解决这个问题的办法?

编辑 - 它不起作用的原因是 distinct 返回的数组本身也非常大。

编辑2 - 这是它的样子。

// Fetch every document that carries this exact "Query Sequence" value.
// The key must match the stored field name ("Query Sequence") byte-for-byte.
db.coll.find({ 'Query Sequence': 'ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG'})

{ 
    "_id" : ObjectId("5424b996ce5254437868c1c9"), 
    "Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_2557_16557_7", 
    "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG", 
    "Chain type" : "VH", 
    "Format Type" : "imgt", 
    "Species" : "human", 
    "Top V Hit" : "IGHV1-2*01", 
    "Top D Hit" : "N/A", 
    "Top J Hit" : "IGHJ4*01", 
    "Productive" : "Yes", 
    "Productive CDR3" : "True", 
    "Strand" : "-", 
    "Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT", 
    "Framework 2 Nucleotides" : "AAGCCTTGCA", 
    "Framework 4 Nucleotides" : "TGCAG", 
    "CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG", 
    "CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG", 
    "Framework 1 AA" : "GWGGCTPQYI", 
    "Framework 2 AA" : "KPC", 
    "Framework 4 AA" : "C", 
    "Framework 1 AA Length" : 10, 
    "Framework 2 AA Length" : 3, 
    "Framework 4 AA Length" : 1, 
    "CDR1 AA" : "SSR*RCIQ", 
    "CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL", 
    "CDR1 AA Length" : 8, 
    "CDR3 AA Length" : 21, 
    "Total V Alignment Matches" : 64, 
    "Total V Alignment Mismatches" : 1, 
    "Total V Alignment Length" : 65, 
    "Total V Alignment Gaps" : 0, 
    "Total V Alignment Identity" : 98.5, 
    "FW1 Alignment From" : 7, 
    "FW1 Alignment To" : 37, 
    "FW1 Alignment Matches" : 31, 
    "FW1 Alignment Mismatches" : 0, 
    "FW1 Alignment Length" : 31, 
    "FW1 Alignment Gaps" : 0, 
    "FW1 Alignment Identity" : 100, 
    "FW2 Alignment From" : 62, 
    "FW2 Alignment To" : 71, 
    "FW2 Alignment Matches" : 9, 
    "FW2 Alignment Mismatches" : 1, 
    "FW2 Alignment Length" : 10, 
    "FW2 Alignment Gaps" : 0, 
    "FW2 Alignment Identity" : 90, 
    "CDR1 Alignment From" : 38, 
    "CDR1 Alignment To" : 61, 
    "CDR1 Alignment Matches" : 24, 
    "CDR1 Alignment Mismatches" : 0, 
    "CDR1 Alignment Length" : 24, 
    "CDR1 Alignment Gaps" : 0, 
    "CDR1 Alignment Identity" : 100, 
    "Junction V-End" : "CTGGG", 
    "V-D Junction" : "N/A", 
    "Junction D-Gene" : "N/A", 
    "D-J Junction" : "N/A", 
    "Junction J-Start" : "G", 
    "Junction Merged" : "CTGGGG", 
    "Stop Codon" : "No", 
    "V-J frame" : "In-frame", 
} 

{ 
    "_id" : ObjectId("5424b996ce52544378867c128"), 
    "Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D", 
    "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG", 
    "Chain type" : "VH", 
    "Format Type" : "imgt", 
    "Species" : "human", 
    "Top V Hit" : "IGHV1-2*01", 
    "Top D Hit" : "N/A", 
    "Top J Hit" : "IGHJ4*01", 
    "Productive" : "Yes", 
    "Productive CDR3" : "True", 
    "Strand" : "-", 
    "Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT", 
    "Framework 2 Nucleotides" : "AAGCCTTGCA", 
    "Framework 4 Nucleotides" : "TGCAG", 
    "CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG", 
    "CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG", 
    "Framework 1 AA" : "GWGGCTPQYI", 
    "Framework 2 AA" : "KPC", 
    "Framework 4 AA" : "C", 
    "Framework 1 AA Length" : 10, 
    "Framework 2 AA Length" : 3, 
    "Framework 4 AA Length" : 1, 
    "CDR1 AA" : "SSR*RCIQ", 
    "CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL", 
    "CDR1 AA Length" : 8, 
    "CDR3 AA Length" : 21, 
    "Total V Alignment Matches" : 64, 
    "Total V Alignment Mismatches" : 1, 
    "Total V Alignment Length" : 65, 
    "Total V Alignment Gaps" : 0, 
    "Total V Alignment Identity" : 98.5, 
    "FW1 Alignment From" : 7, 
    "FW1 Alignment To" : 37, 
    "FW1 Alignment Matches" : 31, 
    "FW1 Alignment Mismatches" : 0, 
    "FW1 Alignment Length" : 31, 
    "FW1 Alignment Gaps" : 0, 
    "FW1 Alignment Identity" : 100, 
    "FW2 Alignment From" : 62, 
    "FW2 Alignment To" : 71, 
    "FW2 Alignment Matches" : 9, 
    "FW2 Alignment Mismatches" : 1, 
    "FW2 Alignment Length" : 10, 
    "FW2 Alignment Gaps" : 0, 
    "FW2 Alignment Identity" : 90, 
    "CDR1 Alignment From" : 38, 
    "CDR1 Alignment To" : 61, 
    "CDR1 Alignment Matches" : 24, 
    "CDR1 Alignment Mismatches" : 0, 
    "CDR1 Alignment Length" : 24, 
    "CDR1 Alignment Gaps" : 0, 
    "CDR1 Alignment Identity" : 100, 
    "Junction V-End" : "CTGGG", 
    "V-D Junction" : "N/A", 
    "Junction D-Gene" : "N/A", 
    "D-J Junction" : "N/A", 
    "Junction J-Start" : "G", 
    "Junction Merged" : "CTGGGG", 
    "Stop Codon" : "No", 
    "V-J frame" : "In-frame", 
} 

正如你所看到的,除了 ObjectId 和 "Sequence Id" 之外,其余字段完全相同。我只想在新集合中保留其中一个文档。我使用的是 MongoDB 2.6.4。

+0

有几种不同的方法可以回答这个问题。但大多数情况下,“对你来说,什么是真正的'冗余'情况?”。您可以通过显示一些样本数据和预期结果来更清楚地说明问题。方法也会根据您的MongoDB版本而有所不同。 – 2014-09-29 03:07:15

+0

为什么给差评(downvote)?请礼貌地解释一下原因 – jwillis0720 2014-09-29 07:04:19

回答

1

我建议您尝试聚合框架。下面的 mongo shell 程序演示了具体做法,并额外加入了几个简化的示例文档,使说明更完整。请注意,其中某些文档的 "Sequence Id" 的第一个字符被改动过。

管道阶段是:

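  1. $sort:按 "Sequence Id" 降序排序,使每个 "Query Sequence" 中 "Sequence Id" 较大的文档排在前面
  2. $group:按 "Query Sequence" 分组,并用 $first 保留每组的第一个文档(即 "Sequence Id" 较大的那个)
  3. $project:把各字段重新提升到顶层
  4. $out:把结果保存到一个新集合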

这里假设您的文档都具有相同的字段。如果文档结构不那么规整,就必须让数据经过客户端程序往返一次——从管道中去掉 $project 和 $out,在客户端程序中分批处理,并手动把 doc 字段的内容提升到顶层。

执行此操作需要额外的磁盘空间。请至少预留 2 倍(2x)的空间:1 倍用于聚合框架运行时的临时空间,1 倍用于保存结果的新集合。

对于机制的文档,请参阅http://docs.mongodb.org/manual/core/aggregation-pipeline/

希望这对您有帮助。

aggregate-group-first-last.js:

// Simplified sample documents. Two pairs share a "Query Sequence" and differ
// only in "Sequence Id"; the last document has a "Query Sequence" of its own.
var docs = [
    {"Sequence Id":"M02331_41_000000000_AAW8D_1_1108_2557_16557_7","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG","ChainType":"VH"},
    {"Sequence Id":"M02331_41_000000000_AAW8D_1_1108_35567_85D","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG","ChainType":"VH"},
    {"Sequence Id":"B02331_41_000000000_AAW8D_1_1108_2557_16557_7","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG","ChainType":"VH"},
    {"Sequence Id":"A02331_41_000000000_AAW8D_1_1108_35567_85D","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG","ChainType":"VH"},
    {"Sequence Id":"C02331_41_000000000_AAW8D_1_1108_35567_85D","Query Sequence":"ATCTACGGTTGGGGCGGAT","ChainType":"VH"}
];
// Empty the collection first so that rerunning the script starts clean.
db.test.remove({});
// insert() is the documented call for loading an array of documents at once;
// save() is specified for a single document.
db.test.insert(docs);
// One stored document is enough to learn the field names (including the
// generated _id), so avoid materializing the whole collection in the shell.
var sample = db.test.findOne();
var keys = Object.keys(sample);
// Build the $project spec that lifts each field of the grouped "doc"
// sub-document back to the top level, e.g. {"ChainType": "$doc.ChainType"}.
var project = {};
// The loop index is declared explicitly so it does not become an implicit global.
for (var i = 0; i < keys.length; i++) {
    project[keys[i]] = "$doc." + keys[i];
}
printjson(project);
/**
 * Build the aggregation pipeline that keeps one document per "Query Sequence".
 *
 * Stages, in order: sort by "Sequence Id" descending; group by
 * "Query Sequence" keeping the first document of each group (the one that
 * sorts first by "Sequence Id"); project that document's fields back to the
 * top level; write the result to the collection named by `out`.
 *
 * Reads the script-level `project` object for the $project stage and echoes
 * the finished pipeline through printjson().
 *
 * @param {string} out - name of the collection the $out stage writes to.
 * @returns {Array<Object>} the pipeline stages, ready to pass to aggregate().
 */
function pipelineWithOut(out) {
    // Declared locally: assigning an undeclared name would create an implicit
    // global and throws a ReferenceError under strict mode.
    var pipeline = [
        {"$sort": {"Sequence Id": -1}},
        // $$ROOT captures the whole input document under "doc".
        {"$group": {_id: "$Query Sequence", doc: {"$first": "$$ROOT"}}},
        {"$project": project},
        {"$out": out}
    ];
    printjson(pipeline);
    return pipeline;
}
// Remove any output collection left over from a previous run.
db.testFirst.drop();
// Run the pipeline; allowDiskUse is passed so the stages may use temporary files.
var dedupStages = pipelineWithOut("testFirst");
db.test.aggregate(dedupStages, {allowDiskUse: true});
// Print the resulting collection.
var dedupedDocs = db.testFirst.find().toArray();
printjson(dedupedDocs);

$ mongo aggregate-group-first-last.js

MongoDB shell version: 2.6.4 
connecting to: test 
{ 
    "_id" : "$doc._id", 
    "Sequence Id" : "$doc.Sequence Id", 
    "Query Sequence" : "$doc.Query Sequence", 
    "ChainType" : "$doc.ChainType" 
} 
[ 
    { 
     "$sort" : { 
      "Sequence Id" : -1 
     } 
    }, 
    { 
     "$group" : { 
      "_id" : "$Query Sequence", 
      "doc" : { 
       "$first" : "$$ROOT" 
      } 
     } 
    }, 
    { 
     "$project" : { 
      "_id" : "$doc._id", 
      "Sequence Id" : "$doc.Sequence Id", 
      "Query Sequence" : "$doc.Query Sequence", 
      "ChainType" : "$doc.ChainType" 
     } 
    }, 
    { 
     "$out" : "testFirst" 
    } 
] 
[ 
    { 
     "_id" : ObjectId("54299b557d7122b60724e5f5"), 
     "Sequence Id" : "B02331_41_000000000_AAW8D_1_1108_2557_16557_7", 
     "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG", 
     "ChainType" : "VH" 
    }, 
    { 
     "_id" : ObjectId("54299b557d7122b60724e5f7"), 
     "Sequence Id" : "C02331_41_000000000_AAW8D_1_1108_35567_85D", 
     "Query Sequence" : "ATCTACGGTTGGGGCGGAT", 
     "ChainType" : "VH" 
    }, 
    { 
     "_id" : ObjectId("54299b557d7122b60724e5f4"), 
     "Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D", 
     "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG", 
     "ChainType" : "VH" 
    } 
]