根据编辑后的内容,数据听起来是非常简单的扁平记录。由于数据量很大,我可能会考虑基于文件的存储——于是这就变成了格式的选择问题。CSV/TSV 非常易于写入,并且借助合适的流式读取器(例如 this one),您不必先把所有数据缓冲到内存中。当然,如果您的显示需要把所有数据都放在内存里,这个优势也就不存在了。
对于大数据量,纯粹的带宽会成为主要瓶颈,此外还有处理时间。CSV 及类似格式通常压缩效果非常好(gzip 等),但压缩会消耗 CPU。就我个人而言,我是“Protocol Buffers”的长期粉丝,它在这里看起来很合适——它非常适合流式访问,带宽占用低,而且是二进制格式——您可以获得更低的 IO 成本,却没有压缩带来的 CPU 开销。而且它更面向对象,因此无需任何额外的解析步骤,就可以更轻松地应用到现有数据上。
我不完全确定我回答的是否就是你所问的问题——所以我先说到这里——如果我有哪里错得离谱,请随时指正。
下面是一个基本的测试程序,用于演示上面讨论的两种方法的一些典型用法:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using ProtoBuf;
/// <summary>
/// Benchmark that writes the same pseudo-random records to a protobuf-net file
/// ("raw.data") and to a tab-separated text file ("raw.tsv"), then reads both back,
/// printing file size, a checksum of the <c>Value</c> fields and elapsed time for each pass.
/// </summary>
static class Program
{
    /// <summary>
    /// Entry point: runs the four passes (write .proto, write tsv, read .proto, read tsv).
    /// </summary>
    static void Main()
    {
        const int Seed = 123456;
        const int LOOP = 500000;
        var timeOrigin = new DateTime(2010, 1, 1);

        Serializer.PrepareSerializer<MyFunRecord>();

        Console.WriteLine("Writing .proto ...");
        var rand = new Random(Seed);
        using (var file = File.Create("raw.data"))
        {
            var watch = Stopwatch.StartNew();
            double total = 0;
            for (int i = 0; i < LOOP; i++)
            {
                MyFunRecord obj = CreateRecord(rand, timeOrigin, i);
                Serializer.SerializeWithLengthPrefix(file, obj, PrefixStyle.Base128, Serializer.ListItemTag);
                total += obj.Value;
            }
            watch.Stop();
            Console.WriteLine(file.Length/(1024 * 1024)+ "MB");
            Console.WriteLine(total + " (check)");
            Console.WriteLine(watch.ElapsedMilliseconds + "ms");
        }

        // Re-seed so the TSV pass generates exactly the same records as the .proto pass.
        rand = new Random(Seed);
        Console.WriteLine();
        Console.WriteLine("Writing tsv ...");
        using (var file = File.Create("raw.tsv"))
        {
            using (var writer = new StreamWriter(file))
            {
                var watch = Stopwatch.StartNew();
                double total = 0;
                for (int i = 0; i < LOOP; i++)
                {
                    MyFunRecord obj = CreateRecord(rand, timeOrigin, i);
                    Write(writer, obj);
                    total += obj.Value;
                }
                watch.Stop();

                // The StreamWriter keeps its own buffer; push it to the file first,
                // otherwise file.Length under-reports the amount of data written.
                writer.Flush();
                Console.WriteLine(file.Length/(1024 * 1024) + "MB");
                Console.WriteLine(total + " (check)");
                Console.WriteLine(watch.ElapsedMilliseconds + "ms");
            }
        }

        Console.WriteLine();
        Console.WriteLine("Reading .proto ...");
        using (var file = File.OpenRead("raw.data"))
        {
            var watch = Stopwatch.StartNew();
            double total = 0;
            foreach (var obj in Serializer.DeserializeItems<MyFunRecord>(file, PrefixStyle.Base128, Serializer.ListItemTag))
            {
                total += obj.Value;
            }
            watch.Stop();
            Console.WriteLine(total + " (check again)");
            Console.WriteLine(watch.ElapsedMilliseconds + "ms");
        }

        Console.WriteLine();
        Console.WriteLine("Reading tsv ...");
        using (var file = File.OpenRead("raw.tsv"))
        using (var reader = new StreamReader(file))
        {
            var watch = Stopwatch.StartNew();
            double total = 0;
            foreach (var obj in Read(reader))
            {
                total += obj.Value;
            }
            watch.Stop();
            Console.WriteLine(total + " (check again)");
            Console.WriteLine(watch.ElapsedMilliseconds + "ms");
        }
    }

    /// <summary>
    /// Builds one test record from the next values of <paramref name="rand"/>.
    /// </summary>
    /// <param name="rand">Random source; advanced by this call.</param>
    /// <param name="timeOrigin">Base date to which a random number of days (0-999) is added.</param>
    /// <param name="id">Value stored in <c>Id</c>.</param>
    /// <returns>A freshly populated record.</returns>
    private static MyFunRecord CreateRecord(Random rand, DateTime timeOrigin, int id)
    {
        // Assigned one statement at a time so the random draws always happen in the
        // same order (Count, Value, When, Name) and both write passes see identical data.
        var obj = new MyFunRecord();
        obj.Id = id;
        obj.Count = rand.Next(500);
        obj.Value = rand.NextDouble() * 4000;
        obj.When = timeOrigin.AddDays(rand.Next(1000));
        obj.Name = RandomString(rand);
        return obj;
    }

    /// <summary>
    /// Writes <paramref name="obj"/> as one tab-separated line: Id, Name, When, Value, Count.
    /// </summary>
    /// <param name="writer">Destination text writer.</param>
    /// <param name="obj">Record to write.</param>
    private static void Write(TextWriter writer, MyFunRecord obj)
    {
        // Format with the invariant culture so the file does not depend on the
        // machine's regional settings (decimal comma, date order, ...).
        CultureInfo invariant = CultureInfo.InvariantCulture;

        writer.Write(obj.Id.ToString(invariant));
        writer.Write('\t');
        writer.Write(obj.Name);
        writer.Write('\t');
        // "s" (sortable) has a fixed, culture-independent layout with second precision.
        writer.Write(obj.When.ToString("s", invariant));
        writer.Write('\t');
        // "R" round-trips the double, so the checksum after reading matches the one after writing.
        writer.Write(obj.Value.ToString("R", invariant));
        writer.Write('\t');
        writer.Write(obj.Count.ToString(invariant));
        writer.WriteLine();
    }

    /// <summary>
    /// Lazily reads records in the format produced by <see cref="Write"/>, one per line.
    /// </summary>
    /// <param name="reader">Source text reader; consumed until <c>ReadLine</c> returns null.</param>
    /// <returns>A streaming sequence of records; nothing is buffered beyond the current line.</returns>
    /// <exception cref="InvalidDataException">A line has fewer than five tab-separated fields.</exception>
    /// <exception cref="FormatException">A field cannot be parsed as its expected type.</exception>
    private static IEnumerable<MyFunRecord> Read(TextReader reader)
    {
        CultureInfo invariant = CultureInfo.InvariantCulture;
        char[] delim = new[] { '\t' };
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] parts = line.Split(delim);
            if (parts.Length < 5)
            {
                throw new InvalidDataException(
                    "Expected 5 tab-separated fields but found " + parts.Length + ".");
            }

            var obj = new MyFunRecord();
            obj.Id = int.Parse(parts[0], invariant);
            obj.Name = parts[1];
            obj.When = DateTime.ParseExact(parts[2], "s", invariant);
            obj.Value = double.Parse(parts[3], invariant);
            obj.Count = int.Parse(parts[4], invariant);
            yield return obj;
        }
    }

    /// <summary>
    /// Generates a random string of 1 to 19 lowercase ASCII letters.
    /// </summary>
    /// <param name="rand">Random source; advanced by this call.</param>
    /// <returns>The generated string.</returns>
    static string RandomString(Random rand)
    {
        int len = rand.Next(1, 20);
        var sb = new StringBuilder(len);
        for (int i = 0; i < len; i++)
        {
            // 'a' + int is an int; without the cast StringBuilder.Append(int) would
            // append the decimal digits of the code point instead of the letter.
            sb.Append((char)('a' + rand.Next(26)));
        }
        return sb.ToString();
    }
}
/// <summary>
/// Flat record used as the benchmark payload. Marked as a protobuf-net contract,
/// with each property mapped to an explicit field number (1-5).
/// </summary>
[ProtoContract]
class MyFunRecord
{
    /// <summary>Record identifier (protobuf field 1).</summary>
    [ProtoMember(1)]
    public int Id { get; set; }

    /// <summary>Record name (protobuf field 2).</summary>
    [ProtoMember(2)]
    public string Name { get; set; }

    /// <summary>Timestamp associated with the record (protobuf field 3).</summary>
    [ProtoMember(3)]
    public DateTime When { get; set; }

    /// <summary>Floating-point value (protobuf field 4).</summary>
    [ProtoMember(4)]
    public double Value { get; set; }

    /// <summary>Integer count (protobuf field 5).</summary>
    [ProtoMember(5)]
    public int Count { get; set; }
}
这完全取决于数据是什么样的……你能描述一下数据的性质吗?例如,你所说的“延迟加载”,是指有很多连续的记录,但一次只需要一条?还是指延迟加载内部对象? – 2011-02-18 11:06:04
@Marc:查看编辑 – 2011-02-18 11:10:42