/*
 * Created by Victor Grishchenko on 3/6/09.
 * Copyright 2009-2012 TECHNISCHE UNIVERSITEIT DELFT. All rights reserved.
 */
11 #include "bin_utils.h"
12 //#include <openssl/sha.h>
24 #define OPENFLAGS O_RDWR|O_CREAT|_O_BINARY
26 #define OPENFLAGS O_RDWR|O_CREAT
30 using namespace swift;
// All-zero 20-byte digest used as the "no hash / unknown" sentinel throughout.
32 const Sha1Hash Sha1Hash::ZERO = Sha1Hash();
// Convenience wrapper around the bundled block-SHA1 implementation:
// hashes `length` bytes at `data` into the 20-byte buffer `hash`.
// NOTE(review): the blk_SHA1_Init(&ctx) call and closing brace are not
// visible in this view of the file — confirm against the full source.
34 void SHA1 (const void *data, size_t length, unsigned char *hash) {
37 blk_SHA1_Update(&ctx, data, length);
38 blk_SHA1_Final(hash, &ctx);
// Interior merkle-tree node: SHA1 over the concatenation of the left and
// right child digests (left bytes first).
41 Sha1Hash::Sha1Hash(const Sha1Hash& left, const Sha1Hash& right) {
44 blk_SHA1_Update(&ctx, left.bits,SIZE);
45 blk_SHA1_Update(&ctx, right.bits,SIZE);
46 blk_SHA1_Final(bits, &ctx);
// Hash a character buffer into this->bits.
// NOTE(review): as shown, `length` is unconditionally overwritten by
// strlen(data) — a guard (e.g. `if (length==0)`) appears to have been
// elided from this view; as written, data must be NUL-terminated and the
// length parameter is ignored. Confirm against the full source.
49 Sha1Hash::Sha1Hash(const char* data, size_t length) {
51 length = strlen(data);
52 SHA1((unsigned char*)data,length,bits);
// Hash `length` raw bytes into this->bits; no string semantics involved.
55 Sha1Hash::Sha1Hash(const uint8_t* data, size_t length) {
56 SHA1(data,length,bits);
// Construct a hash either from a 40-char hex string (hex==true, parsed two
// nibbles at a time via sscanf) or from SIZE raw bytes (hex==false).
// NOTE(review): the branch structure and `val`/byte-store lines are elided
// in this view.
59 Sha1Hash::Sha1Hash(bool hex, const char* hash) {
62 for(int i=0; i<SIZE; i++) {
63 if (sscanf(hash+i*2, "%2x", &val)!=1) {
// Debug-only round-trip check: re-hexing must reproduce the input string.
69 assert(this->hex()==std::string(hash));
// Raw-bytes branch: plain copy of SIZE digest bytes.
71 memcpy(bits,hash,SIZE);
// Render the digest as a lowercase hex string of exactly HASHSZ*2 chars.
// NOTE(review): the declaration of the local `hex` char buffer is elided in
// this view of the file.
74 std::string Sha1Hash::hex() const {
76 for(int i=0; i<HASHSZ; i++)
77 sprintf(hex+i*2, "%02x", (int)(unsigned char)bits[i]);
78 return std::string(hex,HASHSZ*2);
83 /** H a s h t r e e */
/**
 * Open (or create) the content file plus its companion ".mhash" (serialized
 * hash tree) and ".mbinmap" (progress checkpoint) files, then pick one of
 * three (re)construction strategies:
 *   - full hash recompute from content,
 *   - fast restore from the on-disk checkpoint (deserialize), or
 *   - empty/partial recompute.
 * NOTE(review): many lines of this constructor (error checks, hfn/bfn/buf
 * declarations, closing braces) are elided in this view; comments describe
 * only what is visible.
 */
86 HashTree::HashTree (const char* filename, const Sha1Hash& root_hash, uint32_t chunk_size, const char* hash_filename, bool force_check_diskvshash, bool check_netwvshash, const char* binmap_filename) :
87 root_hash_(root_hash), hashes_(NULL), peak_count_(0), fd_(0), hash_fd_(0),
88 filename_(filename), size_(0), sizec_(0), complete_(0), completec_(0),
89 chunk_size_(chunk_size), check_netwvshash_(check_netwvshash)
// Open content file read/write, creating it if absent (rw-r--r-- perms).
91 fd_ = open(filename,OPENFLAGS,S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
94 print_error("cannot open the file");
// Default hash-file name is "<filename>.mhash".
100 hfn.append(".mhash");
101 hash_filename = hfn.c_str();
// Default checkpoint name is "<filename>.mbinmap".
105 if (!binmap_filename){
106 bfn.assign(filename);
107 bfn.append(".mbinmap");
108 binmap_filename = bfn.c_str();
111 // Arno: if user doesn't want to check hashes but no .mhash, check hashes anyway
112 bool actually_force_check_diskvshash = force_check_diskvshash;
// Probe for the .mhash file; ENOENT means we must recompute from content.
113 bool mhash_exists=true;
119 int res = stat( hash_filename, &buf );
120 if( res < 0 && errno == ENOENT)
121 mhash_exists = false;
122 if (!mhash_exists && !force_check_diskvshash)
123 actually_force_check_diskvshash = true;
126 // Arno: if the remainder of the hashtree state is on disk we can
127 // hashcheck very quickly
128 bool binmap_exists=true;
129 res = stat( binmap_filename, &buf );
130 if( res < 0 && errno == ENOENT)
131 binmap_exists = false;
133 //fprintf(stderr,"hashtree: hashchecking want %s do %s binmap-on-disk %s\n", (force_check_diskvshash ? "yes" : "no"), (actually_force_check_diskvshash? "yes" : "no"), (binmap_exists? "yes" : "no") );
// Open (or create) the hash file with the same flags/permissions as content.
135 hash_fd_ = open(hfn.c_str(),OPENFLAGS,S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
138 print_error("cannot open hash file");
142 // Arno: if user wants to or no .mhash, and if root hash unknown (new file) and no checkpoint, (re)calc root hash
143 if (file_size(fd_) && ((actually_force_check_diskvshash) || (root_hash_==Sha1Hash::ZERO && !binmap_exists) || (!mhash_exists)) ) {
144 // fresh submit, hash it
145 dprintf("%s hashtree full compute\n",tintstr());
146 assert(file_size(fd_));
148 } else if (mhash_exists && binmap_exists && file_size(hash_fd_)) {
149 // Arno: recreate hash tree without rereading content
150 dprintf("%s hashtree read from checkpoint\n",tintstr());
151 FILE *fp = fopen(binmap_filename,"rb");
153 print_error("hashtree: cannot open .mbinmap file");
156 if (deserialize(fp) < 0) {
157 // Try to rebuild hashtree data
161 // Arno: no data on disk, or mhash on disk, but no binmap. In latter
162 // case recreate binmap by reading content again. Historic optimization
164 dprintf("%s hashtree empty or partial recompute\n",tintstr());
/**
 * "Header-only" constructor: loads just the checkpoint metadata (root hash,
 * chunk size, progress) from a .mbinmap file without opening any content or
 * hash file. The `dummy` flag only disambiguates the overload.
 */
170 HashTree::HashTree(bool dummy, const char* binmap_filename) :
171 root_hash_(Sha1Hash::ZERO), hashes_(NULL), peak_count_(0), fd_(0), hash_fd_(0),
172 filename_(""), size_(0), sizec_(0), complete_(0), completec_(0),
173 chunk_size_(0), check_netwvshash_(false)
175 FILE *fp = fopen(binmap_filename,"rb");
// partial_deserialize() skips the peak-hash recovery that needs content on disk.
179 if (partial_deserialize(fp) < 0) {
185 // Reads complete file and constructs hash tree
// Full "seeder" path: size the hash file for a complete binary tree
// (2*sizec_ nodes), mmap it, hash every chunk into the leaf layer, fold
// right-children up into interior nodes, extract the peaks, and derive the
// root hash. NOTE(review): several lines (error branches, `pos` advance,
// loop closers) are elided in this view.
186 void HashTree::Submit () {
187 size_ = file_size(fd_);
// Number of chunks, rounding the final partial chunk up.
188 sizec_ = (size_ + chunk_size_-1) / chunk_size_;
190 //fprintf(stderr,"hashtree: submit: cs %i\n", chunk_size_);
192 peak_count_ = gen_peaks(sizec_,peaks_);
// NOTE(review): `int` here overflows once SIZE*sizec_*2 exceeds 2^31
// (content larger than ~50 GB with 1 KB chunks) — confirm intended limits.
193 int hashes_size = Sha1Hash::SIZE*sizec_*2;
194 dprintf("%s hashtree submit resizing hash file\n",tintstr() );
195 file_resize(hash_fd_,hashes_size);
196 hashes_ = (Sha1Hash*) memory_map(hash_fd_,hashes_size);
// On mmap failure, zero all counters so the tree stays in the empty state.
198 size_ = sizec_ = complete_ = completec_ = 0;
199 print_error("mmap failed");
// NOTE(review): this expression mixes the chunk COUNT (sizec_) with the
// chunk SIZE; the size of the last piece would normally be
// size_ - (sizec_-1)*chunk_size_. Looks wrong — verify (the variable may
// also be unused; its uses are not visible here).
202 size_t last_piece_size = (sizec_ - 1) % (chunk_size_) + 1;
203 char *chunk = new char[chunk_size_];
// Leaf pass: hash each chunk; a short read is only legal for the last chunk.
204 for (uint64_t i=0; i<sizec_; i++) {
206 size_t rd = read(fd_,chunk,chunk_size_);
207 if (rd<(chunk_size_) && i!=sizec_-1) {
213 hashes_[pos.toUInt()] = Sha1Hash(chunk,rd);
// Whenever we just filled a right child, its parent's subtree is complete:
// fold upward as far as possible.
215 while (pos.is_right()){
217 hashes_[pos.toUInt()] = Sha1Hash(hashes_[pos.left().toUInt()],hashes_[pos.right().toUInt()]);
// Cache the peak hashes out of the mmapped array, then derive the root.
223 for (int p=0; p<peak_count_; p++) {
224 peak_hashes_[p] = hashes_[peaks_[p].toUInt()];
227 root_hash_ = DeriveRoot();
232 /** Basically, simulated receiving every single chunk, except
233 for some optimizations.
234 Precondition: root hash known */
// Re-verify on-disk content chunk-by-chunk against the (mmapped) hash tree,
// re-offering each chunk's hash as if it had just arrived from the network.
// NOTE(review): the `pos` computation, seek/continue lines, delete[]s and
// closing braces are elided in this view.
235 void HashTree::RecoverProgress () {
237 //fprintf(stderr,"hashtree: recover: cs %i\n", chunk_size_);
// Peak hashes must check out against the root before anything else.
239 if (!RecoverPeakHashes())
242 // at this point, we may use mmapd hashes already
243 // so, lets verify hashes and the data we've got
// Precompute the hash of an all-zero chunk so sparse/unwritten regions can
// be distinguished from genuinely-zero content.
244 char *zero_chunk = new char[chunk_size_];
245 memset(zero_chunk, 0, chunk_size_);
246 Sha1Hash zero_hash(zero_chunk,chunk_size_);
248 // Arno: loop over all pieces, read each from file
249 // ARNOSMPTODO: problem is that we may have the complete hashtree, but
250 // not have all pieces. So hash file gives too little information to
251 // determine whether file is complete on disk.
253 char *buf = new char[chunk_size_];
254 for(int p=0; p<size_in_chunks(); p++) {
// No leaf hash on disk for this chunk: nothing to verify against.
256 if (hashes_[pos.toUInt()]==Sha1Hash::ZERO)
258 size_t rd = read(fd_,buf,chunk_size_);
// Short read is only acceptable for the final chunk.
259 if (rd!=(chunk_size_) && p!=size_in_chunks()-1)
// All-zero data whose expected hash is not the zero-chunk hash is treated
// as a hole (chunk not yet received), not as content.
261 if (rd==(chunk_size_) && !memcmp(buf, zero_chunk, rd) &&
262 hashes_[pos.toUInt()]!=zero_hash) // FIXME // Arno == don't have piece yet?
264 if (!OfferHash(pos, Sha1Hash(buf,rd)) )
269 if (rd!=(chunk_size_) && p==size_in_chunks()-1) // set the exact file size
270 size_ = ((sizec_-1)*chunk_size_) + rd;
276 /** Precondition: root hash known */
// Compute where the peak hashes of a file of this size must live in the
// hash file, read them, and offer them; OfferPeakHash() verifies the set
// against root_hash_ and initializes the tree state when they match.
277 bool HashTree::RecoverPeakHashes()
279 uint64_t size = file_size(fd_);
// Chunk count, rounding the final partial chunk up.
280 uint64_t sizek = (size + chunk_size_-1) / chunk_size_;
282 // Arno: Calc location of peak hashes, read them from hash file and check if
283 // they match to root hash. If so, load hashes into memory.
285 int peak_count = gen_peaks(sizek,peaks);
286 for(int i=0; i<peak_count; i++) {
// Hash file is laid out as a flat array indexed by bin number.
288 file_seek(hash_fd_,peaks[i].toUInt()*sizeof(Sha1Hash));
289 if (read(hash_fd_,&peak_hash,sizeof(Sha1Hash))!=sizeof(Sha1Hash))
291 OfferPeakHash(peaks[i], peak_hash);
294 return false; // if no valid peak hashes found
// Write the checkpoint (.mbinmap) header — version, root hash, chunk size,
// progress counters — followed by the ack_out_ binmap. Returns <0 on any
// write failure (fprintf_retiffail macro). Format must stay in sync with
// internal_deserialize().
// NOTE(review): %lu / %llu assume uint32_t==unsigned long rank and
// uint64_t==unsigned long long — not portable across data models; confirm
// target platforms.
299 int HashTree::serialize(FILE *fp)
301 fprintf_retiffail(fp,"version %i\n", 1 );
302 fprintf_retiffail(fp,"root hash %s\n", root_hash_.hex().c_str() );
303 fprintf_retiffail(fp,"chunk size %lu\n", chunk_size_ );
304 fprintf_retiffail(fp,"complete %llu\n", complete_ );
305 fprintf_retiffail(fp,"completec %llu\n", completec_ );
306 return ack_out_.serialize(fp);
/** Arno: recreate hash tree from .mbinmap file without rereading content.
 * Precondition: root hash known
 */
// Full restore: parse checkpoint AND recover peak hashes from content/hash
// file (contentavail=true). Returns <0 on parse or recovery failure.
313 int HashTree::deserialize(FILE *fp) {
314 return internal_deserialize(fp,true);
// Metadata-only restore: parse checkpoint without touching content files
// (contentavail=false). Used by the header-only constructor.
317 int HashTree::partial_deserialize(FILE *fp) {
318 return internal_deserialize(fp,false);
// Parse the .mbinmap checkpoint written by serialize(): header fields, the
// ack_out_ binmap, then (if contentavail) re-validate peak hashes against
// the recovered root. Returns <0 on failure (fscanf_retiffail macro).
// NOTE(review): declarations of version/cs/c/cc and several control-flow
// lines are elided in this view.
322 int HashTree::internal_deserialize(FILE *fp,bool contentavail) {
// NOTE(review): bare "%s" into a fixed 256-byte buffer — a malformed
// checkpoint line longer than 255 chars overflows it; "%255s" would be the
// safe form. Flagging rather than changing, since this is a doc-only pass.
324 char hexhashstr[256];
329 fscanf_retiffail(fp,"version %i\n", &version );
330 fscanf_retiffail(fp,"root hash %s\n", hexhashstr);
331 fscanf_retiffail(fp,"chunk size %lu\n", &cs);
332 fscanf_retiffail(fp,"complete %llu\n", &c );
333 fscanf_retiffail(fp,"completec %llu\n", &cc );
335 //fprintf(stderr,"hashtree: deserialize: %s %llu ~ %llu * %i\n", hexhashstr, c, cc, cs );
337 if (ack_out_.deserialize(fp) < 0)
// Rebuild the root hash object from its hex representation.
339 root_hash_ = Sha1Hash(true, hexhashstr);
342 // Arno, 2012-01-03: Hack to just get root hash
// With content available, peaks must re-verify against the root; on failure
// wipe the root so the tree reads as uninitialized.
346 if (!RecoverPeakHashes()) {
347 root_hash_ = Sha1Hash::ZERO;
352 // Are reset by RecoverPeakHashes() for some reason.
355 size_ = file_size(fd_);
356 sizec_ = (size_ + chunk_size_-1) / chunk_size_;
// Accept one candidate peak hash from the network. Peaks must arrive
// left-to-right, each starting exactly where the previous one ended and at
// a strictly lower layer. Once the accumulated peaks derive the known root
// hash, the file size becomes known: content and hash files are resized,
// the hash file is mmapped, and the peaks are written into the tree.
// NOTE(review): numerous lines (early returns, peak_count_ increment,
// closing braces) are elided in this view.
362 bool HashTree::OfferPeakHash (bin_t pos, const Sha1Hash& hash) {
363 char bin_name_buf[32];
364 dprintf("%s hashtree offer peak %s\n",tintstr(),pos.str(bin_name_buf));
// Ordering/adjacency check against the last accepted peak.
368 bin_t last_peak = peaks_[peak_count_-1];
369 if ( pos.layer()>=last_peak.layer() ||
370 pos.base_offset()!=last_peak.base_offset()+last_peak.base_length() )
// Tentatively record the candidate peak.
373 peaks_[peak_count_] = pos;
374 peak_hashes_[peak_count_] = hash;
376 // check whether peak hash candidates add up to the root hash
377 Sha1Hash mustbe_root = DeriveRoot();
378 if (mustbe_root!=root_hash_)
// Peaks verified: total chunk count is the sum of the peak base lengths.
380 for(int i=0; i<peak_count_; i++)
381 sizec_ += peaks_[i].base_length();
383 // bingo, we now know the file size (rounded up to a chunk_size() unit)
385 size_ = sizec_ * chunk_size_;
386 completec_ = complete_ = 0;
387 sizec_ = (size_ + chunk_size_-1) / chunk_size_;
389 // ARNOTODO: win32: this is pretty slow for ~200 MB already. Perhaps do
390 // on-demand sizing for Win32?
// Resize content file only if its current size falls outside the legal
// range for this chunk count (avoids clobbering an exact partial size).
391 uint64_t cur_size = file_size(fd_);
392 if ( cur_size<=(sizec_-1)*chunk_size_ || cur_size>sizec_*chunk_size_ ) {
393 dprintf("%s hashtree offerpeak resizing file\n",tintstr() );
394 if (file_resize(fd_, size_)) {
395 print_error("cannot set file size\n");
396 size_=0; // remain in the 0-state
401 // mmap the hash file into memory
// Complete binary tree over sizec_ leaves needs 2*sizec_ node slots.
402 uint64_t expected_size = sizeof(Sha1Hash)*sizec_*2;
403 // Arno, 2011-10-18: on Windows we could optimize this away,
404 //CreateFileMapping, see compat.cpp will resize the file for us with
407 if ( file_size(hash_fd_) != expected_size ) {
408 dprintf("%s hashtree offerpeak resizing hash file\n",tintstr() );
409 file_resize (hash_fd_, expected_size);
412 hashes_ = (Sha1Hash*) memory_map(hash_fd_,expected_size);
// On mmap failure, zero all counters so the tree stays in the empty state.
414 size_ = sizec_ = complete_ = completec_ = 0;
415 print_error("mmap failed");
// Persist the verified peaks into the mmapped hash array.
419 for(int i=0; i<peak_count_; i++)
420 hashes_[peaks_[i].toUInt()] = peak_hashes_[i];
422 dprintf("%s hashtree memory mapped\n",tintstr() );
// Fold the current peak hashes (right-to-left) into a single root hash:
// pair each accumulated hash with either the next peak (when it is the
// sibling) or ZERO padding. Returns Sha1Hash::ZERO when the peak set is
// inconsistent. NOTE(review): the `p` initialization, loop structure and
// return are elided in this view.
428 Sha1Hash HashTree::DeriveRoot () {
430 dprintf("%s hashtree deriving root\n",tintstr() );
// Start from the right-most (lowest-layer) peak.
432 int c = peak_count_-1;
434 Sha1Hash hash = peak_hashes_[c];
436 // Arno, 2011-10-14: Root hash = top of smallest tree covering content IMHO.
437 //while (!p.is_all()) {
// No sibling peak on the right: pad with the ZERO hash.
441 hash = Sha1Hash(hash,Sha1Hash::ZERO);
// Expected a left-sibling peak here; its absence means the set is invalid.
443 if (c<0 || peaks_[c]!=p.sibling())
444 return Sha1Hash::ZERO;
445 hash = Sha1Hash(peak_hashes_[c],hash);
450 //fprintf(stderr,"root bin is %lli covers %lli\n", p.toUInt(), p.base_length() );
455 /** For live streaming: appends the data, adjusts the tree.
456 @ return the number of fresh (tail) peak hashes */
// NOTE(review): the body is not visible in this view of the file —
// presumably a stub or unimplemented; confirm against the full source.
457 int HashTree::AppendData (char* data, int length) {
// Return the peak bin that contains `pos`, or bin_t::NONE when `pos` lies
// outside every peak (i.e. beyond the known content range).
// NOTE(review): the declaration/advance of the index `pi` is elided here.
462 bin_t HashTree::peak_for (bin_t pos) const {
464 while (pi<peak_count_ && !peaks_[pi].contains(pos))
466 return pi==peak_count_ ? bin_t(bin_t::NONE) : peaks_[pi];
// Accept one hash from the network for bin `pos`. Before the file size is
// known this delegates to OfferPeakHash(). Otherwise the hash is stored and
// an uncle-path walk climbs from `pos` toward its peak, recomputing parent
// hashes until a previously-proven hash is met; success means the offered
// hash is consistent with the verified tree. NOTE(review): several lines
// (declaration/advance of `p`, some returns/braces) are elided in this view.
469 bool HashTree::OfferHash (bin_t pos, const Sha1Hash& hash) {
470 if (!size_) // only peak hashes are accepted at this point
471 return OfferPeakHash(pos,hash);
// Network-hash checking disabled: accept without verification.
474 if (!check_netwvshash_)
477 bin_t peak = peak_for(pos);
481 return hash == hashes_[pos.toUInt()];
// Parent already acked ⇒ this subtree's hashes are settled; just compare.
482 if (!ack_out_.is_empty(pos.parent()))
483 return hash==hashes_[pos.toUInt()]; // have this hash already, even accptd data
485 // Arno: if we already verified this hash against the root, don't replace
486 if (!is_hash_verified_.is_empty(bin_t(0,pos.toUInt())))
487 return hash == hashes_[pos.toUInt()];
// Tentatively store the unverified hash.
489 hashes_[pos.toUInt()] = hash;
491 return false; // who cares?
// Climb toward the peak, recomputing each parent from its children, until
// hitting an acked or already-verified ancestor.
493 Sha1Hash uphash = hash;
494 // Arno: Note well: bin_t(0,p.toUInt()) is to abuse binmap as bitmap.
495 while ( p!=peak && ack_out_.is_empty(p) && is_hash_verified_.is_empty(bin_t(0,p.toUInt())) ) {
496 hashes_[p.toUInt()] = uphash;
498 // Arno: Prevent poisoning the tree with bad values:
499 // Left hand hashes should never be zero, and right
500 // hand hash is only zero for the last packet, i.e.,
501 // layer 0. Higher layers will never have 0 hashes
502 // as SHA1(zero+zero) != zero (but b80de5...)
504 if (hashes_[p.left().toUInt()] == Sha1Hash::ZERO || hashes_[p.right().toUInt()] == Sha1Hash::ZERO)
506 uphash = Sha1Hash(hashes_[p.left().toUInt()],hashes_[p.right().toUInt()]);
507 }// walk to the nearest proven hash
// The climb succeeded iff the recomputed hash matches the proven ancestor.
509 bool success = (uphash==hashes_[p.toUInt()]);
512 // Arno: The hash checks out. Mark all hashes on the uncle path as
513 // being verified, so we don't have to go higher than them on a next
516 // Arno: Note well: bin_t(0,p.toUInt()) is to abuse binmap as bitmap.
517 is_hash_verified_.set(bin_t(0,p.toUInt()));
// Mark each uncle on the way up to the peak layer as verified.
518 while (p.layer() != peak.layer()) {
519 p = p.parent().sibling();
520 is_hash_verified_.set(bin_t(0,p.toUInt()));
522 // Also mark hashes on direct path to root as verified. Doesn't decrease
523 // #checks, but does increase the number of verified hashes faster.
527 is_hash_verified_.set(bin_t(0,p.toUInt()));
// Accept one chunk of content for bin `pos`: verify its hash via
// OfferHash(), then pwrite() it to the content file at its offset. Short
// chunks are legal only for the final chunk, which also fixes the exact
// file size. NOTE(review): early returns, ack_out_ bookkeeping and closing
// braces are elided in this view.
535 bool HashTree::OfferData (bin_t pos, const char* data, size_t length) {
// Only the last chunk of the file may be shorter than chunk_size_.
540 if (length<chunk_size_ && pos!=bin_t(0,sizec_-1))
// Already have this chunk; report success without rewriting.
542 if (ack_out_.is_filled(pos))
543 return true; // to set data_in_
544 bin_t peak = peak_for(pos);
// Verify the chunk's hash against the tree before touching the disk.
548 Sha1Hash data_hash(data,length);
549 if (!OfferHash(pos, data_hash)) {
550 char bin_name_buf[32];
551 // printf("invalid hash for %s: %s\n",pos.str(bin_name_buf),data_hash.hex().c_str()); // paranoid
552 //fprintf(stderr,"INVALID HASH FOR %lli layer %d\n", pos.toUInt(), pos.layer() );
553 dprintf("%s hashtree check failed (bug TODO) %s\n",tintstr(),pos.str(bin_name_buf));
557 //printf("g %lli %s\n",(uint64_t)pos,hash.hex().c_str());
559 // Arno,2011-10-03: appease g++
560 if (pwrite(fd_,data,length,pos.base_offset()*chunk_size_) < 0)
561 print_error("pwrite failed");
// Final chunk: now the exact (un-padded) file size is known; trim the file.
564 if (pos.base_offset()==sizec_-1) {
565 size_ = ((sizec_-1)*chunk_size_) + length;
566 if (file_size(fd_)!=size_)
567 file_resize(fd_,size_);
// Number of bytes available sequentially from offset 0: the first empty bin
// in ack_out_ bounds the contiguous prefix. NOTE(review): the adjustment
// between these lines (e.g. last-chunk handling) is elided in this view.
573 uint64_t HashTree::seq_complete () {
574 uint64_t seqc = ack_out_.find_empty().base_offset();
578 return seqc*chunk_size_;
// Unmap the hash array; one Sha1Hash slot per node of the complete binary
// tree over sizec_ leaves (2*sizec_ nodes). NOTE(review): the guard around
// this call and the fd close-up code are not visible in this view.
581 HashTree::~HashTree () {
583 memory_unmap(hash_fd_, hashes_, sizec_*2*sizeof(Sha1Hash));