在内存中构建各行起始位置的索引(只需遍历文件一次,且无需把整个文件读入内存),之后即可快速随机访问任意一行。
这段代码并不健壮(没有验证、范围检查等),但可以说明思路:
import sys
# Size, in bytes, of each chunk requested per read when scanning a file.
BUFFER_LEN = 1024
def findNewLines(s):
    """Return the offsets immediately after each newline in *s*.

    Each returned value is the index of the character (or byte) that follows
    a "\\n", i.e. the position where the next line would start.

    Args:
        s: Text or binary data to scan (str, bytes or bytearray).

    Returns:
        A list of ints in ascending order; empty when *s* has no newline.
    """
    # The needle must have the same type as the haystack: on Python 3,
    # searching bytes for a str raises TypeError.  On Python 2 ``bytes`` is
    # ``str``, so both literals are equivalent there.
    newline = b"\n" if isinstance(s, (bytes, bytearray)) else "\n"
    retval = []
    pos = s.find(newline)
    while pos >= 0:
        # Step past the newline itself so the offset points at the next line.
        pos += 1
        retval.append(pos)
        pos = s.find(newline, pos)
    return retval
class RandomAccessFile(object):
    """Index the line offsets of a file once, then fetch any line by number.

    The constructor makes a single pass over the file and keeps only the byte
    offset of each line in memory; the file contents themselves are re-read
    from disk on demand by GetLine.
    """

    def __init__(self, fileName):
        """Scan *fileName* and record where every line starts.

        Args:
            fileName: Path of the file to index.

        Raises:
            IOError/OSError: If the file cannot be opened or read.
        """
        self.fileName = fileName
        # startPositions[i] is the byte offset of line i.  The final entry is
        # the end-of-file offset, so line i always occupies the half-open
        # byte range startPositions[i]:startPositions[i + 1] -- including a
        # last line that has no trailing newline.
        self.startPositions = [0]
        offset = 0
        with open(fileName, "rb") as f:
            # Iterating a binary file yields one "\n"-terminated line at a
            # time, so the whole file is never held in memory at once.
            for line in f:
                offset += len(line)
                self.startPositions.append(offset)

    def GetLine(self, index):
        """Return line number *index* (0-based) without its line terminator.

        Args:
            index: Zero-based line number.

        Returns:
            The raw bytes of the line as read in binary mode, with the
            trailing "\\n" (and a "\\r" directly before it, if any) removed.

        Raises:
            IndexError: If *index* is negative or past the last line.
        """
        # Validate explicitly: a negative index would otherwise be accepted
        # by list indexing and silently return unrelated data.
        if index < 0 or index + 1 >= len(self.startPositions):
            raise IndexError("line index out of range: %r" % (index,))
        start = self.startPositions[index]
        end = self.startPositions[index + 1]
        with open(self.fileName, "rb") as f:
            f.seek(start)
            line = f.read(end - start)
        # Strip the terminator by inspecting the data instead of assuming a
        # fixed width; the last line of the file may have no terminator.
        if line.endswith(b"\n"):
            line = line[:-1]
            if line.endswith(b"\r"):
                line = line[:-1]
        return line
# Demo: index the system word list once, then fetch a few lines by number.
raf = RandomAccessFile('/usr/share/dict/words')
# Single-argument print(...) parses on both Python 2 and Python 3, whereas
# the bare print statement is a SyntaxError on Python 3.
for lineNumber in (0, 10, 456, 71015):
    print(raf.GetLine(lineNumber))
输出是:
python indexedFile.py
A
Aaronic
abrim
flippantness
*“把数据随机地喂给算法非常重要”*——顺序磁盘 I/O API 并不是为这种疯狂的用法而设计的。——署名:一个对“深度学习”一无所知的人 –
我会用 `set(file_obj)` 把文件的各行放进一个集合,然后用 `random.sample` 取出所需数量的随机元素。 – zondo
@zondo 这样不会把整个文件内容都加载到内存中吗? –