Merge pull request #4762

399cdbc contrib/linearize: Add feature to set file's timestamp based on block header time. (Jeff Garzik)
8f5a423 contrib/linearize: split block files based on year-month, not just year (Jeff Garzik)
75400a2 contrib/linearize: Guarantee that output is generated in-order (Jeff Garzik)
This commit is contained in:
Wladimir J. van der Laan 2014-09-04 15:19:55 +02:00
commit d800dcc32a
No known key found for this signature in database
GPG Key ID: 74810B012346C9A6
2 changed files with 52 additions and 26 deletions

View File

@ -27,6 +27,7 @@ output.
Optional config file setting for linearize-data: Optional config file setting for linearize-data:
* "netmagic": network magic number * "netmagic": network magic number
* "max_out_sz": maximum output file size (default 1000*1000*1000) * "max_out_sz": maximum output file size (default 1000*1000*1000)
* "split_year": Split files when a new year is first seen, in addition to * "split_timestamp": Split files when a new month is first seen, in addition to
reaching a maximum file size. reaching a maximum file size.
* "file_timestamp": Set each file's last-modified time to that of the
most recent block in that file.

View File

@ -10,11 +10,13 @@
import json import json
import struct import struct
import re import re
import os
import base64 import base64
import httplib import httplib
import sys import sys
import hashlib import hashlib
import datetime import datetime
import time
settings = {} settings = {}
@ -58,10 +60,12 @@ def calc_hash_str(blk_hdr):
hash_str = hash.encode('hex') hash_str = hash.encode('hex')
return hash_str return hash_str
def get_blk_year(blk_hdr): def get_blk_dt(blk_hdr):
members = struct.unpack("<I", blk_hdr[68:68+4]) members = struct.unpack("<I", blk_hdr[68:68+4])
dt = datetime.datetime.fromtimestamp(members[0]) nTime = members[0]
return dt.year dt = datetime.datetime.fromtimestamp(nTime)
dt_ym = datetime.datetime(dt.year, dt.month, 1)
return (dt_ym, nTime)
def get_block_hashes(settings): def get_block_hashes(settings):
blkindex = [] blkindex = []
@ -86,16 +90,21 @@ def copydata(settings, blkindex, blkset):
outFn = 0 outFn = 0
outsz = 0 outsz = 0
outF = None outF = None
outFname = None
blkCount = 0 blkCount = 0
lastYear = 0 lastDate = datetime.datetime(2000, 1, 1)
splitYear = False highTS = 1408893517 - 315360000
timestampSplit = False
fileOutput = True fileOutput = True
setFileTime = False
maxOutSz = settings['max_out_sz'] maxOutSz = settings['max_out_sz']
if 'output' in settings: if 'output' in settings:
fileOutput = False fileOutput = False
if settings['split_year'] != 0: if settings['file_timestamp'] != 0:
splitYear = True setFileTime = True
if settings['split_timestamp'] != 0:
timestampSplit = True
while True: while True:
if not inF: if not inF:
@ -125,36 +134,49 @@ def copydata(settings, blkindex, blkset):
print("Skipping unknown block " + hash_str) print("Skipping unknown block " + hash_str)
continue continue
if blkindex[blkCount] != hash_str:
print("Out of order block.")
print("Expected " + blkindex[blkCount])
print("Got " + hash_str)
sys.exit(1)
if not fileOutput and ((outsz + inLen) > maxOutSz): if not fileOutput and ((outsz + inLen) > maxOutSz):
outF.close() outF.close()
if setFileTime:
os.utime(outFname, (int(time.time()), highTS))
outF = None outF = None
outFname = None
outFn = outFn + 1 outFn = outFn + 1
outsz = 0 outsz = 0
if splitYear: (blkDate, blkTS) = get_blk_dt(blk_hdr)
blkYear = get_blk_year(blk_hdr) if timestampSplit and (blkDate > lastDate):
if blkYear > lastYear: print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
print("New year " + str(blkYear) + " @ " + hash_str) lastDate = blkDate
lastYear = blkYear
if outF: if outF:
outF.close() outF.close()
if setFileTime:
os.utime(outFname, (int(time.time()), highTS))
outF = None outF = None
outFname = None
outFn = outFn + 1 outFn = outFn + 1
outsz = 0 outsz = 0
if not outF: if not outF:
if fileOutput: if fileOutput:
fname = settings['output_file'] outFname = settings['output_file']
else: else:
fname = "%s/blk%05d.dat" % (settings['output'], outFn) outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
print("Output file" + fname) print("Output file" + outFname)
outF = open(fname, "wb") outF = open(outFname, "wb")
outF.write(inhdr) outF.write(inhdr)
outF.write(rawblock) outF.write(rawblock)
outsz = outsz + inLen + 8 outsz = outsz + inLen + 8
blkCount = blkCount + 1 blkCount = blkCount + 1
if blkTS > highTS:
highTS = blkTS
if (blkCount % 1000) == 0: if (blkCount % 1000) == 0:
print("Wrote " + str(blkCount) + " blocks") print("Wrote " + str(blkCount) + " blocks")
@ -184,13 +206,16 @@ if __name__ == '__main__':
settings['input'] = 'input' settings['input'] = 'input'
if 'hashlist' not in settings: if 'hashlist' not in settings:
settings['hashlist'] = 'hashlist.txt' settings['hashlist'] = 'hashlist.txt'
if 'split_year' not in settings: if 'file_timestamp' not in settings:
settings['split_year'] = 0 settings['file_timestamp'] = 0
if 'split_timestamp' not in settings:
settings['split_timestamp'] = 0
if 'max_out_sz' not in settings: if 'max_out_sz' not in settings:
settings['max_out_sz'] = 1000L * 1000 * 1000 settings['max_out_sz'] = 1000L * 1000 * 1000
settings['max_out_sz'] = long(settings['max_out_sz']) settings['max_out_sz'] = long(settings['max_out_sz'])
settings['split_year'] = int(settings['split_year']) settings['split_timestamp'] = int(settings['split_timestamp'])
settings['file_timestamp'] = int(settings['file_timestamp'])
settings['netmagic'] = settings['netmagic'].decode('hex') settings['netmagic'] = settings['netmagic'].decode('hex')
if 'output_file' not in settings and 'output' not in settings: if 'output_file' not in settings and 'output' not in settings: