我只想复制一个集合,但要去掉其中重复的条目。一种做法是导出整个集合,并把其中一个字段改成 _id。另一种做法是复制该集合,并对该字段建立索引以消除重复项。但这样的办法太多了。我的目标是:生成一个去除了重复项的新集合。
但是,有没有更优雅的解决方案?也许我可以做这样的事情。
// Copy one representative document per distinct 'Query Sequence' value
// from `coll` into `newcollection`.
// ES5 syntax on purpose: the post targets the MongoDB 2.6 shell.
// NOTE(review): distinct() returns every distinct value in a single array;
// the author reports this is what fails when that array is very large.
var distinctSequences = db.coll.distinct('Query Sequence');
distinctSequences.forEach(function (sequence) {
  // findOne returns a single matching document for this sequence value.
  var representative = db.coll.findOne({ 'Query Sequence': sequence });
  db.newcollection.insert(representative);
});
当然,这样做行不通……有没有人知道解决这个问题的办法?
编辑 - 它行不通的原因是 distinct 返回的数组本身也非常大。
编辑2 - 这是它的样子。
// Find every document whose 'Query Sequence' field equals this read.
// The key must match the field name stored in the documents ("Query Sequence").
db.coll.find({ 'Query Sequence': 'ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG'})
{
"_id" : ObjectId("5424b996ce5254437868c1c9"),
"Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_2557_16557_7",
"Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
"Chain type" : "VH",
"Format Type" : "imgt",
"Species" : "human",
"Top V Hit" : "IGHV1-2*01",
"Top D Hit" : "N/A",
"Top J Hit" : "IGHJ4*01",
"Productive" : "Yes",
"Productive CDR3" : "True",
"Strand" : "-",
"Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT",
"Framework 2 Nucleotides" : "AAGCCTTGCA",
"Framework 4 Nucleotides" : "TGCAG",
"CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG",
"CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG",
"Framework 1 AA" : "GWGGCTPQYI",
"Framework 2 AA" : "KPC",
"Framework 4 AA" : "C",
"Framework 1 AA Length" : 10,
"Framework 2 AA Length" : 3,
"Framework 4 AA Length" : 1,
"CDR1 AA" : "SSR*RCIQ",
"CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL",
"CDR1 AA Length" : 8,
"CDR3 AA Length" : 21,
"Total V Alignment Matches" : 64,
"Total V Alignment Mismatches" : 1,
"Total V Alignment Length" : 65,
"Total V Alignment Gaps" : 0,
"Total V Alignment Identity" : 98.5,
"FW1 Alignment From" : 7,
"FW1 Alignment To" : 37,
"FW1 Alignment Matches" : 31,
"FW1 Alignment Mismatches" : 0,
"FW1 Alignment Length" : 31,
"FW1 Alignment Gaps" : 0,
"FW1 Alignment Identity" : 100,
"FW2 Alignment From" : 62,
"FW2 Alignment To" : 71,
"FW2 Alignment Matches" : 9,
"FW2 Alignment Mismatches" : 1,
"FW2 Alignment Length" : 10,
"FW2 Alignment Gaps" : 0,
"FW2 Alignment Identity" : 90,
"CDR1 Alignment From" : 38,
"CDR1 Alignment To" : 61,
"CDR1 Alignment Matches" : 24,
"CDR1 Alignment Mismatches" : 0,
"CDR1 Alignment Length" : 24,
"CDR1 Alignment Gaps" : 0,
"CDR1 Alignment Identity" : 100,
"Junction V-End" : "CTGGG",
"V-D Junction" : "N/A",
"Junction D-Gene" : "N/A",
"D-J Junction" : "N/A",
"Junction J-Start" : "G",
"Junction Merged" : "CTGGGG",
"Stop Codon" : "No",
"V-J frame" : "In-frame",
}
{
"_id" : ObjectId("5424b996ce52544378867c128"),
"Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D",
"Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
"Chain type" : "VH",
"Format Type" : "imgt",
"Species" : "human",
"Top V Hit" : "IGHV1-2*01",
"Top D Hit" : "N/A",
"Top J Hit" : "IGHJ4*01",
"Productive" : "Yes",
"Productive CDR3" : "True",
"Strand" : "-",
"Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT",
"Framework 2 Nucleotides" : "AAGCCTTGCA",
"Framework 4 Nucleotides" : "TGCAG",
"CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG",
"CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG",
"Framework 1 AA" : "GWGGCTPQYI",
"Framework 2 AA" : "KPC",
"Framework 4 AA" : "C",
"Framework 1 AA Length" : 10,
"Framework 2 AA Length" : 3,
"Framework 4 AA Length" : 1,
"CDR1 AA" : "SSR*RCIQ",
"CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL",
"CDR1 AA Length" : 8,
"CDR3 AA Length" : 21,
"Total V Alignment Matches" : 64,
"Total V Alignment Mismatches" : 1,
"Total V Alignment Length" : 65,
"Total V Alignment Gaps" : 0,
"Total V Alignment Identity" : 98.5,
"FW1 Alignment From" : 7,
"FW1 Alignment To" : 37,
"FW1 Alignment Matches" : 31,
"FW1 Alignment Mismatches" : 0,
"FW1 Alignment Length" : 31,
"FW1 Alignment Gaps" : 0,
"FW1 Alignment Identity" : 100,
"FW2 Alignment From" : 62,
"FW2 Alignment To" : 71,
"FW2 Alignment Matches" : 9,
"FW2 Alignment Mismatches" : 1,
"FW2 Alignment Length" : 10,
"FW2 Alignment Gaps" : 0,
"FW2 Alignment Identity" : 90,
"CDR1 Alignment From" : 38,
"CDR1 Alignment To" : 61,
"CDR1 Alignment Matches" : 24,
"CDR1 Alignment Mismatches" : 0,
"CDR1 Alignment Length" : 24,
"CDR1 Alignment Gaps" : 0,
"CDR1 Alignment Identity" : 100,
"Junction V-End" : "CTGGG",
"V-D Junction" : "N/A",
"Junction D-Gene" : "N/A",
"D-J Junction" : "N/A",
"Junction J-Start" : "G",
"Junction Merged" : "CTGGGG",
"Stop Codon" : "No",
"V-J frame" : "In-frame",
}
正如你所看到的,除了 ObjectId 和 Sequence Id 之外,其余内容完全相同。我只想在新集合中保留其中一个文档。我使用的是 Mongo 2.6.4。
这个问题有几种不同的解答方式,但关键在于:对你来说,究竟什么才算“冗余”?你可以给出一些样本数据和预期结果,把问题说得更清楚。具体方法也会因你的 MongoDB 版本而有所不同。 – 2014-09-29 03:07:15
为什么要点踩(downvote)?请礼貌地说明一下原因 – jwillis0720 2014-09-29 07:04:19