diff --git a/bin/audit/truenas_audit.pl b/bin/audit/truenas_audit.pl
new file mode 100755
index 00000000..8eff7f91
--- /dev/null
+++ b/bin/audit/truenas_audit.pl
@@ -0,0 +1,281 @@
+#!/usr/bin/perl
+
+# Audits a TrueNAS repository tree: records zip/METS sizes in feed_storage,
+# verifies pairtree layout, file counts, symlinks, and (optionally) zip MD5s
+# against the repository METS, logging problems to feed_audit_detail.
+
+use strict;
+use warnings;
+
+use Data::Dumper;
+use DBI;
+use File::Basename;
+use File::Pairtree qw(ppath2id s2ppchars);
+use FindBin;
+use POSIX qw(strftime);
+use Getopt::Long;
+use URI::Escape;
+
+use lib "$FindBin::Bin/../../lib";
+use HTFeed::Config qw(get_config);
+use HTFeed::DBTools qw(get_dbh);
+use HTFeed::Log {root_logger => 'INFO, screen'};
+use HTFeed::METS;
+use HTFeed::Namespace;
+use HTFeed::PackageType;
+use HTFeed::RepositoryIterator;
+use HTFeed::Volume;
+use HTFeed::VolumeValidator;
+
+my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ?";
+
+my $insert =
+"insert into feed_storage (namespace, id, storage_name, zip_size, mets_size, lastchecked) values(?,?,?,?,?,CURRENT_TIMESTAMP) \
+ON DUPLICATE KEY UPDATE zip_size=?, mets_size=?, lastchecked = CURRENT_TIMESTAMP";
+
+my $update =
+"update feed_storage set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ? and storage_name = ?";
+
+my $insert_detail =
+"insert into feed_audit_detail (namespace, id, storage_name, path, status, detail) values (?,?,?,?,?,?)";
+
+my $checkpoint_sel =
+"select lastmd5check > ? from feed_storage where namespace = ? and id = ?";
+
+### set /sdr1 to /sdrX for test & parallelization
+
+my $do_md5 = 0;
+my $checkpoint = undef;
+my $noop = undef;
+my $storage_name = undef;
+GetOptions(
+    'md5!' => \$do_md5,
+    'checkpoint=s' => \$checkpoint,
+    'noop' => \$noop,
+    'storage_name=s' => \$storage_name,
+);
+
+# $storage_name must be one of 's3-truenas-ictc', 's3-truenas-macc'
+if (!defined $storage_name) {
+    die '--storage_name is required';
+}
+if ($storage_name ne 's3-truenas-macc' && $storage_name ne 's3-truenas-ictc') {
+    # FIX: the closing quote around the second storage name was missing from the message
+    die "--storage_name must have value of 's3-truenas-macc' or 's3-truenas-ictc'";
+}
+
+my $base = shift @ARGV or die("Missing base directory..");
+my $iterator = HTFeed::RepositoryIterator->new($base);
+my $sdr_partition = $iterator->{sdr_partition};
+
+while (my $obj = $iterator->next_object) {
+    my $path = $obj->{path};
+    my $namespace = $obj->{namespace};
+    my $objid = $obj->{objid};
+    eval {
+        # The pairtree-decoded objid must agree with the terminal directory name
+        if ($obj->{directory_objid} ne $objid) {
+            set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE",
+                "$objid $obj->{directory_objid}" );
+        }
+
+        # get last modified date
+        my $zipfile = "$obj->{path}/$obj->{objid}.zip";
+        my $zip_seconds;
+        my $zipdate;
+        my $zipsize;
+
+        if ( -e $zipfile ) {
+            $zip_seconds = ( stat($zipfile) )[9];
+            $zipdate = strftime( "%Y-%m-%d %H:%M:%S", localtime($zip_seconds) );
+            $zipsize = -s $zipfile;
+        }
+
+        my $metsfile = "$obj->{path}/$obj->{objid}.mets.xml";
+
+        my $mets_seconds;
+        my $metsdate;
+        my $metssize;
+
+        if ( -e $metsfile ) {
+            $mets_seconds = ( stat($metsfile) )[9];
+            $metssize = -s $metsfile;
+            $metsdate = strftime( "%Y-%m-%d %H:%M:%S", localtime($mets_seconds) );
+        }
+
+        # Most recent mtime of the two files (either may be missing)
+        my $last_touched = $zip_seconds;
+        $last_touched = $mets_seconds
+            if defined $mets_seconds
+            and ( not defined $zip_seconds or $mets_seconds > $zip_seconds );
+
+        # test symlinks unless we're traversing sdr1
+        if ( $sdr_partition ne '1' ) {
+            # FIX: direct method call instead of indirect object syntax "new Class"
+            # NOTE(review): $volume is not used in this branch — confirm whether
+            # Volume construction is needed here for its validation side effects.
+            my $volume = HTFeed::Volume->new(
+                packagetype => "pkgtype",
+                namespace => $namespace,
+                objid => $objid
+            );
+            my $link_path = $path;
+            $link_path =~ s/sdr$sdr_partition/sdr1/;
+            my $link_target = readlink $link_path
+              or set_status( $namespace, $objid, $storage_name, $path, "CANT_LSTAT",
+                "$link_path $!" );
+
+            if ( defined $link_target and $link_target ne $path ) {
+                set_status( $namespace, $objid, $storage_name, $path, "SYMLINK_INVALID",
+                    $link_target || '' );
+            }
+        }
+
+        # insert
+        execute_stmt(
+            $insert,
+            $namespace, $objid, $storage_name,
+            $zipsize, $metssize,
+            # duplicate parameters for duplicate key update
+            $zipsize, $metssize,
+        );
+
+        # does barcode have a zip & xml, and do they match?
+        my $filecount = 0;
+        my $found_zip = 0;
+        my $found_mets = 0;
+        foreach my $file (@{$obj->{contents}}) {
+            next if $file =~ /pre_uplift.mets.xml$/; # ignore backup mets
+            if ( $file !~ /^([^.]+)\.(zip|mets.xml)$/ ) {
+                set_status($namespace, $objid, $storage_name, $path, "BAD_FILE", "$file");
+                next;
+            }
+            my $dir_barcode = $1;
+            my $ext = $2;
+            $found_zip++ if $ext eq 'zip';
+            $found_mets++ if $ext eq 'mets.xml';
+            if ($objid ne $dir_barcode) {
+                set_status($namespace, $objid, $storage_name, $path, "BARCODE_MISMATCH", "$objid $dir_barcode");
+            }
+            $filecount++;
+        }
+
+        # check file count; do md5 check and METS extraction stuff
+        if (defined $zip_seconds || defined $mets_seconds) {
+            if ( $filecount > 2 or $filecount < 1
+                or ( $found_zip != 1 and not is_tombstoned($namespace, $objid) )
+                or $found_mets != 1 ) {
+                set_status( $namespace, $objid, $storage_name, $path, "BAD_FILECOUNT",
+                    "zip=$found_zip mets=$found_mets total=$filecount" );
+            }
+
+            eval {
+                my $rval = zipcheck( $namespace, $objid, $storage_name );
+                if ($rval) {
+                    execute_stmt( $update, "1", $namespace, $objid, $storage_name );
+                }
+                elsif ( defined $rval ) {
+                    execute_stmt( $update, "0", $namespace, $objid, $storage_name );
+                }
+            };
+            if ($@) {
+                set_status( $namespace, $objid, $storage_name, $path, "CANT_ZIPCHECK", $@ );
+            }
+        }
+    };
+
+    if ($@) {
+        warn($@);
+    }
+}
+
+get_dbh()->disconnect();
+$iterator->close;
+
+# Compare the MD5 of the repository zip against the checksum recorded in the
+# repository METS. Returns 1 (match), 0 (mismatch), or undef (not checked).
+sub zipcheck {
+    my ( $namespace, $objid, $storage_name ) = @_;
+
+    return unless $do_md5;
+
+    return if is_tombstoned($namespace, $objid);
+
+    # don't check this item if we just looked at it
+    if (defined $checkpoint) {
+        my $sth = execute_stmt($checkpoint_sel, $checkpoint, $namespace, $objid);
+        if (my @row = $sth->fetchrow_array()) {
+            return if @row and $row[0];
+        }
+    }
+
+    # use google as a 'default' namespace for now
+    my $volume = HTFeed::Volume->new(
+        packagetype => "pkgtype",
+        namespace => $namespace,
+        objid => $objid
+    );
+    my $mets = $volume->get_repository_mets_xpc();
+    my $rval = undef;
+
+    # Extract the CHECKSUM attribute for the mets:file whose FLocat xlink:href
+    # names the zip file.
+    # FIX: removed the redundant `if ($do_md5)` wrapper — the early
+    # `return unless $do_md5` above already guarantees it.
+    my $zipname = $volume->get_zip_filename();
+    my $mets_zipsum = $mets->findvalue(
+        "//mets:file[mets:FLocat/\@xlink:href='$zipname']/\@CHECKSUM");
+
+    # FIX: length() returns a number — compare numerically (was `ne 32`)
+    if (not defined $mets_zipsum or length($mets_zipsum) != 32) {
+        # zip name may be uri-escaped in some cases
+        $zipname = uri_escape($zipname);
+        $mets_zipsum = $mets->findvalue(
+            "//mets:file[mets:FLocat/\@xlink:href='$zipname']/\@CHECKSUM");
+    }
+
+    if ( not defined $mets_zipsum or length($mets_zipsum) != 32 ) {
+        set_status( $namespace, $objid, $storage_name,
+            $volume->get_repository_mets_path(),
+            "MISSING_METS_CHECKSUM", undef );
+    }
+    else {
+        my $realsum = HTFeed::VolumeValidator::md5sum(
+            $volume->get_repository_zip_path() );
+        if ( $mets_zipsum eq $realsum ) {
+            print "$zipname OK\n";
+            $rval = 1;
+        }
+        else {
+            set_status( $namespace, $objid, $storage_name,
+                $volume->get_repository_zip_path(),
+                "BAD_CHECKSUM", "expected=$mets_zipsum actual=$realsum" );
+            $rval = 0;
+        }
+    }
+    return $rval;
+}
+
+# Warn and record a problem row in feed_audit_detail.
+# Args mirror $insert_detail: namespace, id, storage_name, path, status, detail.
+sub set_status {
+    warn( join( " ", @_ ), "\n" );
+    execute_stmt( $insert_detail, @_ );
+}
+
+sub execute_stmt {
+    my $stmt = shift;
+
+    # Bail out if noop and the SQL statement is mutating; SELECT is okay.
+    # FIX: /^insert|update/ matched "update" ANYWHERE (alternation binds tighter
+    # than ^), so a SELECT mentioning "update" would be skipped. Anchor both.
+    return if $noop and $stmt =~ /\A(?:insert|update)/i;
+
+    my $dbh = get_dbh();
+    my $sth = $dbh->prepare($stmt);
+    $sth->execute(@_);
+    return $sth;
+}
+
+# There are as of early 2026 still 13 is_tombstoned entries in feed_audit, so this check stays.
+sub is_tombstoned {
+    my $namespace = shift;
+    my $objid = shift;
+    my $sth = execute_stmt($tombstone_check, $namespace, $objid);
+    if (my @row = $sth->fetchrow_array()) {
+        return $row[0];
+    } else {
+        return 0;
+    }
+}
+
+__END__
diff --git a/etc/ingest.sql b/etc/ingest.sql
index 17e95127..6551e41e 100644
--- a/etc/ingest.sql
+++ b/etc/ingest.sql
@@ -3,6 +3,7 @@ USE `ht`;
 CREATE TABLE IF NOT EXISTS `feed_audit` (
   `namespace` varchar(10) NOT NULL,
   `id` varchar(30) NOT NULL,
+  `storage_name` varchar(32) NOT NULL,
   `sdr_partition` tinyint(4) DEFAULT NULL,
   `zip_size` bigint(20) DEFAULT NULL,
   `image_size` bigint(20) DEFAULT NULL,
@@ -14,7 +15,7 @@ CREATE TABLE IF NOT EXISTS `feed_audit` (
   `lastmd5check` timestamp NULL DEFAULT NULL,
   `md5check_ok` tinyint(1) DEFAULT NULL,
   `is_tombstoned` tinyint(1) DEFAULT NULL,
-  PRIMARY KEY (`namespace`,`id`),
+  PRIMARY KEY (`namespace`,`id`,`storage_name`),
   KEY `feed_audit_zip_date_idx` (`zip_date`)
 );
@@ -144,11 +145,12 @@ CREATE TABLE IF NOT EXISTS `feed_storage` (
 CREATE TABLE IF NOT EXISTS `feed_audit_detail` (
   `namespace` varchar(10) NOT NULL,
   `id` varchar(30) NOT NULL,
+  `storage_name` varchar(32) NOT NULL,
   `path` varchar(255) DEFAULT NULL,
   `status` varchar(30) DEFAULT NULL,
   `detail` tinytext,
   `time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
-  KEY `fs_log_status_objid_idx` (`namespace`,`id`)
+  KEY `fs_log_status_objid_idx` (`namespace`,`id`,`storage_name`)
 );
 
 USE `ht`;
diff --git a/lib/HTFeed/RepositoryIterator.pm b/lib/HTFeed/RepositoryIterator.pm
new file mode 100644
index 00000000..e2e18c25
--- /dev/null
+++ b/lib/HTFeed/RepositoryIterator.pm
@@ -0,0 +1,136 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+#use FindBin;
+#use lib "$FindBin::Bin/../../lib";
+
+use File::Basename;
+use File::Pairtree qw(ppath2id s2ppchars);
+
+package HTFeed::RepositoryIterator;
+
+# The only restriction on `path` is that it must have a component ending with `sdrX`
+# where X is one or more digits
+sub new {
+    my $class = shift;
+    my $path
= shift;
+
+    # Remove trailing slash from path if necessary
+    $path =~ s!/$!!;
+    # FIX: removed an unused `my @pathcomp = split('/', $path);` declaration
+    my $sdr_partition = undef;
+    if ($path =~ qr#/?sdr(\d+)/?#) {
+        $sdr_partition = $1;
+    } else {
+        die "Cannot infer SDR partition from $path";
+    }
+    my $self = {
+        # The path to traverse. May be a subpath like /tmp/sdr1/obj/test
+        path => $path,
+        sdr_partition => $sdr_partition,
+        objects_processed => 0,
+    };
+    bless($self, $class);
+    return $self;
+}
+
+# Returns the next object directory found under the traversal root as a
+# hashref (path, namespace, objid, directory_objid, contents), or undef
+# when the underlying `find` stream is exhausted.
+sub next_object {
+    my $self = shift;
+
+    my $obj = undef;
+    while (1) {
+        my $line = readline($self->_find_pipe);
+        last unless defined $line;
+        chomp $line;
+        # Pairtree stuff
+        next if $line =~ /pairtree_prefix$/;
+        # ignore temporary location
+        next if $line =~ qr(obj/\.tmp);
+        #next if $line =~ /\Qpre_uplift.mets.xml\E/;
+        #next if $self->_recent_previous_version($line);
+
+        my ($file_objid, $path, $type) = File::Basename::fileparse($line);
+        # Remove trailing slash
+        $path =~ s!/$!!;
+        # `find` yields several files per directory; emit each directory once.
+        # NOTE(review): this assumes find lists a directory's files contiguously.
+        next if $self->{prev_path} and $path eq $self->{prev_path};
+
+        $self->{objects_processed}++;
+        $self->{prev_path} = $path;
+
+        # Remove everything up to and including the `sdrX/`
+        my $subpath = $path;
+        $subpath =~ s!.*?sdr\d+/!!;
+        my @pathcomp = split('/', $subpath);
+        @pathcomp = grep { $_ ne '' } @pathcomp;
+        my $namespace = $pathcomp[1];
+        my $directory_objid = $pathcomp[-1];
+        my $objid = File::Pairtree::ppath2id(join('/', @pathcomp));
+        $obj = {
+            path => $path,
+            namespace => $namespace,
+            # Caller should make sure objid and directory_objid are equivalent,
+            # and also that objid matches the contents
+            objid => $objid,
+            directory_objid => $directory_objid,
+            contents => $self->_contents($path),
+        };
+        last;
+    }
+    return $obj;
+}
+
+# Close the `find` pipe if it is open. Safe to call repeatedly.
+sub close {
+    my $self = shift;
+
+    if ($self->{find_pipe}) {
+        close $self->{find_pipe};
+        $self->{find_pipe} = undef;
+    }
+}
+
+# Returns a sorted arrayref with filenames (not full paths) in
+# an object directory. Excludes . and ..
+sub _contents {
+    my $self = shift;
+    my $path = shift;
+
+    my @contents;
+    # FIX: opendir failure was silently ignored (would return an empty listing)
+    opendir(my $dh, $path) or die "Can't opendir $path: $!";
+    while ( my $file = readdir($dh) ) {
+        next if $file eq '.' or $file eq '..';
+        push(@contents, $file);
+    }
+    closedir($dh);
+    @contents = sort @contents;
+    return \@contents;
+}
+
+# Lazily opens (and caches) a pipe reading filenames from `find`.
+sub _find_pipe {
+    my $self = shift;
+
+    if (!$self->{find_pipe}) {
+        my $find_pipe;
+        # FIX: list-form pipe open. The old 2-arg open interpolated the path
+        # into a shell command, breaking on spaces and allowing injection.
+        open($find_pipe, '-|', 'find', $self->{path}, '-follow', '-type', 'f')
+            or die("Can't open pipe to find: $!");
+        $self->{find_pipe} = $find_pipe;
+    }
+    return $self->{find_pipe};
+}
+
+# NOTE: is this needed?
+# Does file end with `.old` suffix and is it less than 48 hours old?
+sub _recent_previous_version {
+    my $self = shift;
+    my $file = shift;
+
+    # FIX: escape the dot and anchor at true end-of-string (was /.old$/,
+    # which matched e.g. "gold" and anything before a trailing newline)
+    if ($file =~ /\.old\z/) {
+        my $ctime = ( stat($file) )[10];
+        my $ctime_age = time() - $ctime;
+        return 1 if $ctime_age < (86400 * 2);
+    }
+    return 0;
+}
+
+1;
diff --git a/lib/HTFeed/Storage/LocalPairtree.pm b/lib/HTFeed/Storage/LocalPairtree.pm
index 7308fcda..4be518a2 100644
--- a/lib/HTFeed/Storage/LocalPairtree.pm
+++ b/lib/HTFeed/Storage/LocalPairtree.pm
@@ -138,19 +138,21 @@ sub record_audit {
     my ($sdr_partition) = ($path =~ qr#/?sdr(\d+)/?#);
 
     my $stmt =
-"insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked, lastmd5check, md5check_ok) \
-values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \
+"insert into feed_audit (namespace, id, storage_name, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked, lastmd5check, md5check_ok) \
+values(?,?,?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \
 ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP,lastmd5check = CURRENT_TIMESTAMP, md5check_ok = 1";
 
 # TODO populate image_size, page_count
 
+    return unless defined $self->{name};
+
     my $zipsize = $self->zip_size;
     my $zipdate = $self->file_date($self->zip_obj_path);
     my $metssize =
$self->mets_size;
     my $metsdate = $self->file_date($self->mets_obj_path);
 
     my $sth = get_dbh()->prepare($stmt);
     my $res = $sth->execute(
-        $self->{namespace}, $self->{objid},
+        $self->{namespace}, $self->{objid}, $self->{name},
         $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate,
         # duplicate parameters for duplicate key update
         $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate
diff --git a/lib/HTFeed/StorageAudit.pm b/lib/HTFeed/StorageAudit.pm
index 1bb9f48f..a5a8db21 100644
--- a/lib/HTFeed/StorageAudit.pm
+++ b/lib/HTFeed/StorageAudit.pm
@@ -397,9 +397,9 @@ sub record_error {
     my $status = shift @$err;
     my %details = @$err;
     my $detail = join "\t", map { "$_: $details{$_}"; } keys %details;
-    my $sql = 'INSERT INTO feed_audit_detail (namespace, id, path, status, detail)'.
-        ' VALUES (?,?,?,?,?)';
-    execute_stmt($sql, $obj->{namespace}, $obj->{objid}, $obj->{path}, $status, $detail);
+    my $sql = 'INSERT INTO feed_audit_detail (namespace, id, storage_name, path, status, detail)'.
+        ' VALUES (?,?,?,?,?,?)';
+    execute_stmt($sql, $obj->{namespace}, $obj->{objid}, $self->{storage_name}, $obj->{path}, $status, $detail);
 }
 
 # ==== UTILITY CLASS METHOD ====
diff --git a/t/repository_iterator.t b/t/repository_iterator.t
new file mode 100644
index 00000000..1db6fd60
--- /dev/null
+++ b/t/repository_iterator.t
@@ -0,0 +1,88 @@
+use strict;
+use warnings;
+
+use File::Copy;
+use File::Pairtree qw(id2ppath s2ppchars);
+use File::Path;
+
+use Test::Spec;
+use HTFeed::RepositoryIterator;
+
+describe "HTFeed::RepositoryIterator" => sub {
+    spec_helper 'storage_helper.pl';
+    local our ($tmpdirs, $testlog);
+
+    # Build a minimal pairtree entry (empty .zip and .mets.xml) under /tmp/sdr1.
+    sub make_sdr_entry {
+        my $namespace = shift;
+        my $objid = shift;
+
+        my $pt_objid = s2ppchars($objid);
+        my $pt_path = id2ppath($objid);
+        my $full_path = "/tmp/sdr1/obj/$namespace/$pt_path" .
$pt_objid;
+        File::Path::make_path($full_path);
+        # FIX: create the empty fixture files without shelling out via backticks
+        for my $file ("$pt_objid.mets.xml", "$pt_objid.zip") {
+            open(my $fh, '>', "$full_path/$file")
+                or die "Can't create $full_path/$file: $!";
+            close($fh);
+        }
+    }
+
+    before all => sub {
+        # FIX: removed dead `my $namespace/$objid = 'test'` declarations
+        make_sdr_entry('ns1', 'objid1');
+        make_sdr_entry('ns2', 'objid2');
+    };
+
+    after all => sub {
+        File::Path::remove_tree('/tmp/sdr1');
+    };
+
+    describe 'new' => sub {
+        it "creates an object that exposes the expected data" => sub {
+            my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1');
+            is($iterator->{path}, '/tmp/sdr1', 'it has the path we gave it');
+            is($iterator->{sdr_partition}, 1, 'it has sdr partition of 1 from sdr1');
+        };
+    };
+
+    describe 'next_object' => sub {
+        it "returns an object with the expected data" => sub {
+            my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1');
+            my $object = $iterator->next_object;
+            is($object->{path}, '/tmp/sdr1/obj/ns1/pairtree_root/ob/ji/d1/objid1', 'path to the terminal directory');
+            is($object->{namespace}, 'ns1', 'namespace `ns1` from path');
+            is($object->{objid}, 'objid1', 'objid `objid1` from pairtree');
+            is($object->{directory_objid}, 'objid1', 'directory_objid `objid1` from terminal directory name');
+            is_deeply($object->{contents}, ['objid1.mets.xml','objid1.zip'], '.mets.xml and .zip contents');
+            is($iterator->{objects_processed}, 1, 'it has processed 1 object');
+        };
+
+        it "returns two objects" => sub {
+            my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1');
+            while ($iterator->next_object) { }
+            is($iterator->{objects_processed}, 2, 'it has processed 2 objects');
+        };
+
+        describe 'with a subdirectory' => sub {
+            it "returns an object with the expected data" => sub {
+                my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1/obj/ns1/');
+                my $object = $iterator->next_object;
+                is($object->{path}, '/tmp/sdr1/obj/ns1/pairtree_root/ob/ji/d1/objid1', 'path to the terminal directory');
+                is($object->{namespace}, 'ns1', 'namespace `ns1` from path');
+                is($object->{objid}, 'objid1', 'objid `objid1` from pairtree');
+                is($object->{directory_objid}, 'objid1', 'directory_objid `objid1` from terminal directory name');
+                is_deeply($object->{contents}, ['objid1.mets.xml','objid1.zip'], '.mets.xml and .zip contents');
+                is($iterator->{objects_processed}, 1, 'it has processed 1 file');
+            };
+
+            it "returns only one object" => sub {
+                my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1/obj/ns1/');
+                while ($iterator->next_object) { }
+                is($iterator->{objects_processed}, 1, 'it has processed 1 object');
+            };
+        };
+    };
+};
+
+runtests unless caller;
diff --git a/t/truenas_audit.t b/t/truenas_audit.t
new file mode 100644
index 00000000..ad13b64c
--- /dev/null
+++ b/t/truenas_audit.t
@@ -0,0 +1,269 @@
+use strict;
+use warnings;
+
+use Data::Dumper;
+use File::Copy;
+use File::Pairtree qw(id2ppath s2ppchars);
+use File::Spec;
+use Test::Spec;
+
+use HTFeed::DBTools qw(get_dbh);
+use HTFeed::Storage::LocalPairtree;
+
+# FIX: the describe label named the wrong script ("bin/audit/main_repo_audit.pl")
+describe "bin/audit/truenas_audit.pl" => sub {
+    spec_helper 'storage_helper.pl';
+    local our ($tmpdirs, $testlog);
+
+    sub local_storage {
+        my $volume = stage_volume($tmpdirs, @_);
+
+        my $storage = HTFeed::Storage::LocalPairtree->new(
+            name => 'localpairtree-test',
+            volume => $volume,
+            config => {
+                obj_dir => $tmpdirs->{obj_dir}
+            }
+        );
+        return $storage;
+    }
+
+    # Returns the data as arrayref of hashref
+    sub get_feed_storage_data {
+        my $namespace = shift;
+        my $objid = shift;
+        my $storage_name = shift;
+
+        my $data = [];
+        my $sql = 'SELECT * FROM feed_storage WHERE namespace=? AND id=?
AND storage_name=?';
+        my $sth = get_dbh()->prepare($sql);
+        $sth->execute($namespace, $objid, $storage_name);
+        while (my $row = $sth->fetchrow_hashref) {
+            push(@$data, $row);
+        }
+        return $data;
+    }
+
+    # Returns the data as arrayref of hashref
+    sub get_feed_audit_detail_data {
+        my $namespace = shift;
+        my $objid = shift;
+        my $storage_name = shift;
+
+        my $data = [];
+        my $sql = 'SELECT * FROM feed_audit_detail WHERE namespace=? AND id=? AND storage_name=?';
+        my $sth = get_dbh()->prepare($sql);
+        $sth->execute($namespace, $objid, $storage_name);
+        while (my $row = $sth->fetchrow_hashref) {
+            push(@$data, $row);
+        }
+        return $data;
+    }
+
+    # `RepositoryIterator` can infer its sdr partition when it isn't at the root of the
+    # filesystem but it does need an "sdrX" directory _somewhere_ in the path. We can't use
+    # `$tmpdirs->{obj_dir}` by itself.
+    sub temp_sdr_path {
+        my $sdr_partition = shift || 1;
+
+        return File::Spec->catfile($tmpdirs->{tmpdir}, "sdr$sdr_partition");
+    }
+
+    sub temp_sdr_obj_path {
+        my $sdr_partition = shift || 1;
+        my $namespace = shift || 'test';
+        my $objid = shift || 'test';
+
+        return File::Spec->catfile(
+            temp_sdr_path($sdr_partition),
+            'obj',
+            $namespace,
+            id2ppath($objid),
+            s2ppchars($objid)
+        );
+    }
+
+    sub temp_link_path {
+        my $namespace = shift || 'test';
+        my $objid = shift || 'test';
+
+        return File::Spec->catfile(
+            File::Spec->rootdir,
+            'tmp',
+            'obj_link',
+            $namespace,
+            id2ppath($objid),
+            s2ppchars($objid)
+        );
+    }
+
+    # Set up sdr1 and sdr2 directories with the appropriate linkage from latter to former.
+    # Copy contents from `$tmpdirs->{obj_dir}` into a local sdr2 so `RepositoryIterator` has
+    # the directory named "sdr2" somewhere in the path that it needs.
+    sub make_test_directories {
+        my $namespace = shift;
+        my $objid = shift;
+        my $sdr2_path = temp_sdr_path(2);
+        # FIX: pass $namespace/$objid through to the path helpers instead of
+        # silently relying on their 'test'/'test' defaults
+        my $sdr1_obj_path = temp_sdr_obj_path(1, $namespace, $objid);
+        my $sdr2_obj_path = temp_sdr_obj_path(2, $namespace, $objid);
+        my $temp_link_path = temp_link_path($namespace, $objid);
+
+        File::Path::make_path("$sdr2_obj_path");
+        system("cp -r $tmpdirs->{obj_dir}/* $sdr2_path/obj/");
+        # Symlink into obj_link so Volume.pm can find the files,
+        # and into sdr1 for symlink checks inside truenas_audit.pl
+        # Create directory structures but remove the leaf node so we can recreate it as a symlink.
+        # This is kind of silly but trying to create a partial path would be messier.
+        File::Path::make_path($temp_link_path);
+        File::Path::remove_tree($temp_link_path);
+        File::Path::make_path($sdr1_obj_path);
+        File::Path::remove_tree($sdr1_obj_path);
+        system("ln -sf $sdr2_obj_path $temp_link_path");
+        system("ln -sf $sdr2_obj_path $sdr1_obj_path");
+    }
+
+    before each => sub {
+        my $namespace = 'test';
+        my $objid = 'test';
+        my $storage = local_storage($namespace, $objid);
+        $storage->stage;
+        $storage->make_object_path;
+        $storage->move;
+        make_test_directories($namespace, $objid);
+    };
+
+    after each => sub {
+        File::Path::remove_tree(temp_sdr_path);
+        File::Path::remove_tree(temp_sdr_path(2));
+        File::Path::remove_tree('/tmp/obj_link');
+        get_dbh->prepare('DELETE FROM feed_storage')->execute;
+        get_dbh->prepare('DELETE FROM feed_audit_detail')->execute;
+    };
+
+    foreach my $storage_name (('s3-truenas-macc', 's3-truenas-ictc')) {
+        # FIX: include $storage_name so the two generated examples have
+        # distinct descriptions in test output
+        it "writes to feed_storage ($storage_name)" => sub {
+            my $temp_sdr_path = temp_sdr_path;
+            system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+            my $db_data = get_feed_storage_data('test', 'test', $storage_name);
+            is(scalar(@$db_data), 1, 'with only one initial entry');
+            is($db_data->[0]->{namespace}, 'test', 'correct namespace');
+            is($db_data->[0]->{id}, 'test', 'correct id');
+            is($db_data->[0]->{storage_name}, $storage_name, 'correct storage_name');
+            ok($db_data->[0]->{zip_size} > 0, 'nonzero zip_size');
+            ok($db_data->[0]->{mets_size} > 0, 'nonzero mets_size');
+            ok(!defined $db_data->[0]->{saved_md5sum}, 'not defined saved_md5sum');
+            ok(defined $db_data->[0]->{deposit_time}, 'defined deposit_time');
+            ok(defined $db_data->[0]->{lastchecked}, 'defined lastchecked');
+            ok(defined $db_data->[0]->{lastmd5check}, 'defined lastmd5check');
+            is($db_data->[0]->{md5check_ok}, 1, 'md5check_ok=1');
+        };
+    }
+
+    # If existing data, only `lastchecked` and `lastmd5check` will change
+    # (file sizes will also be updated but with the same data).
+    it "updates existing data" => sub {
+        my $temp_sdr_path = temp_sdr_path;
+        my $storage_name = 's3-truenas-macc';
+        system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+        my $db_data = get_feed_storage_data('test', 'test', $storage_name);
+        is(scalar(@$db_data), 1, 'with only one initial entry');
+        my $old_lastchecked = $db_data->[0]->{lastchecked};
+        my $old_lastmd5check = $db_data->[0]->{lastmd5check};
+        sleep 1;
+        system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+        $db_data = get_feed_storage_data('test', 'test', $storage_name);
+        my $new_lastchecked = $db_data->[0]->{lastchecked};
+        my $new_lastmd5check = $db_data->[0]->{lastmd5check};
+        is(scalar(@$db_data), 1, 'with only one final entry');
+        isnt($old_lastchecked, $new_lastchecked, 'with changed `lastchecked`');
+        isnt($old_lastmd5check, $new_lastmd5check, 'with changed `lastmd5check`');
+    };
+
+    it "records a failed MD5 check" => sub {
+        my $temp_sdr_path = temp_sdr_path;
+        my $storage_name = 's3-truenas-macc';
+        my $objid = 'test';
+        # Replace the zip with garbage
+        my $zip_path = File::Spec->catfile(temp_sdr_obj_path, "$objid.zip");
+        open(my $fh, '>', $zip_path) or die "open zip file $zip_path failed: $!";
+        print $fh "shwoozle\n";
+        close($fh);
+        system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+        my $db_data = get_feed_storage_data('test', 'test', $storage_name);
+        is(scalar(@$db_data), 1, 'with only one initial feed_storage entry');
+        ok(defined $db_data->[0]->{lastchecked}, 'defined lastchecked');
+        ok(defined $db_data->[0]->{lastmd5check}, 'defined lastmd5check');
+        is($db_data->[0]->{md5check_ok}, 0, 'md5check_ok=0');
+        my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name);
+        is(scalar(@$detail_data), 1, 'with one feed_audit_detail entry');
+        is($detail_data->[0]->{namespace}, 'test', 'feed_audit_detail namespace');
+        is($detail_data->[0]->{id}, 'test', 'feed_audit_detail id');
+        is($detail_data->[0]->{storage_name}, $storage_name, 'feed_audit_detail storage_name');
+        # The path for these examples is via the symlink, so it will be different from the $zip_path we fiddled with
+        ok($detail_data->[0]->{path} =~ /\.zip$/, 'feed_audit_detail path');
+        is($detail_data->[0]->{status}, 'BAD_CHECKSUM', 'feed_audit_detail status');
+        ok($detail_data->[0]->{detail} =~ /expected=/, 'feed_audit_detail detail');
+        ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined');
+    };
+
+    it "records a spurious file but ignores pre-uplift METS" => sub {
+        my $temp_sdr_path = temp_sdr_path;
+        my $storage_name = 's3-truenas-macc';
+        my $objid = 'test';
+        # Add a silly file and a pre-uplift file (can be empty, contents don't matter)
+        foreach my $ext (('silly', 'pre_uplift.mets.xml')) {
+            my $path = File::Spec->catfile(temp_sdr_obj_path, "$objid.$ext");
+            system("touch $path");
+        }
+        system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+        my $db_data = get_feed_storage_data('test', 'test', $storage_name);
+        is(scalar(@$db_data), 1, 'with only one feed_storage entry');
+        is($db_data->[0]->{md5check_ok}, 1, 'md5check_ok=1');
+        my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name);
+        is(scalar(@$detail_data), 1, 'with one feed_audit_detail entry');
+        is($detail_data->[0]->{namespace}, 'test', 'feed_audit_detail namespace');
+        is($detail_data->[0]->{id}, 'test', 'feed_audit_detail id');
+        is($detail_data->[0]->{storage_name}, $storage_name, 'feed_audit_detail storage_name');
+        ok(defined $detail_data->[0]->{path}, 'feed_audit_detail path defined');
+        is($detail_data->[0]->{status}, 'BAD_FILE', 'feed_audit_detail status');
+        ok($detail_data->[0]->{detail} =~ /silly/, 'feed_audit_detail detail');
+        ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined');
+    };
+
+    # For symlink checks we use sdr2 so the symlinks in sdr1 can be verified to point to
+    # the right place in sdr2.
+    it "checks symlinks" => sub {
+        my $temp_sdr_path = temp_sdr_path(2);
+        my $storage_name = 's3-truenas-macc';
+        system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+        my $db_data = get_feed_storage_data('test', 'test', $storage_name);
+        is(scalar(@$db_data), 1, 'with feed_storage entry');
+        my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name);
+        is(scalar(@$detail_data), 0, 'with no feed_audit_detail entries');
+    };
+
+    it "detects bad symlinks" => sub {
+        my $temp_sdr_path = temp_sdr_path(2);
+        my $storage_name = 's3-truenas-macc';
+
+        # Remove the symlink on sdr1 and replace it with a link to somewhere else
+        my $sdr1_link_location = temp_sdr_obj_path;
+        # "Somewhere else" is /dev/null
+        # Create a symlink clobbering the existing one without following it
+        system("ln -sfn /dev/null $sdr1_link_location");
+
+        system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path");
+        my $db_data = get_feed_storage_data('test', 'test', $storage_name);
+        is(scalar(@$db_data), 1, 'with feed_storage entry');
+        my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name);
+        is(scalar(@$detail_data), 1, 'with one feed_audit_detail entry');
+        is($detail_data->[0]->{namespace}, 'test', 'feed_audit_detail namespace');
+        is($detail_data->[0]->{id}, 'test', 'feed_audit_detail id');
+        is($detail_data->[0]->{storage_name}, $storage_name, 'feed_audit_detail storage_name');
+        ok($detail_data->[0]->{path} =~ /sdr2/, 'feed_audit_detail path implicates sdr2');
+        is($detail_data->[0]->{status}, 'SYMLINK_INVALID', 'feed_audit_detail status');
+        ok($detail_data->[0]->{detail} =~ /null/, 'feed_audit_detail detail');
+        ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined');
+    };
+};
+
+runtests unless caller;