-
Notifications
You must be signed in to change notification settings - Fork 0
ETT-1288 Run fixity check on new storage #175
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
moseshll
wants to merge
15
commits into
main
Choose a base branch
from
ETT-1288_fixity
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+786
−8
Open
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
876a2e5
ETT-1288 Run fixity check on new storage
moseshll 412d376
Remove stale comment
moseshll d7770dc
Restore unnecessarily changed regex for sdr_partition
moseshll 318cbc6
Remove unused includes in t/main_repo_audit.t
moseshll cf920f6
Add RepositoryIterator class
moseshll 3aeeb60
Ignore pairtree_prefix file
moseshll 7c0d2a3
- Revert main_repo_audit.pl
moseshll 9334243
Remove no longer needed `opendir` in truenas_audit.pl
moseshll 6f5c0f6
Spurious file test just needs `touch` without worrying about writing f…
moseshll 33f290a
Add comment and remove FIXME on question about checking `feed_audit.i…
moseshll 06a29c7
Put testing sdrX directory under `$tmpdirs->{tmpdir}` instead of `/tm…
moseshll b279f29
- Enable symlink checks outside sdr1, and add tests for same
moseshll fad9af7
Remove two unused subroutines from main_repo_audit but unneeded here
moseshll 431c359
Replace backticks with system()
moseshll 43bf649
Update record_audit logic to bail out if storage name is not set
moseshll File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,281 @@ | ||
| #!/usr/bin/perl | ||
|
|
||
| use strict; | ||
| use warnings; | ||
|
|
||
| use Data::Dumper; | ||
| use DBI; | ||
| use File::Basename; | ||
| use File::Pairtree qw(ppath2id s2ppchars); | ||
| use FindBin; | ||
| use POSIX qw(strftime); | ||
| use Getopt::Long; | ||
| use URI::Escape; | ||
|
|
||
| use lib "$FindBin::Bin/../../lib"; | ||
| use HTFeed::Config qw(get_config); | ||
| use HTFeed::DBTools qw(get_dbh); | ||
| use HTFeed::Log {root_logger => 'INFO, screen'}; | ||
| use HTFeed::METS; | ||
| use HTFeed::Namespace; | ||
| use HTFeed::PackageType; | ||
| use HTFeed::RepositoryIterator; | ||
| use HTFeed::Volume; | ||
| use HTFeed::VolumeValidator; | ||
|
|
||
# --- SQL statements; placeholders are bound positionally by execute_stmt ---

# Tombstone flag for one object in feed_audit.
my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ?";

# Insert or refresh the zip/METS sizes of one object on one storage. The two
# sizes are bound twice: once for the insert, once for the duplicate-key update.
my $insert =
"insert into feed_storage (namespace, id, storage_name, zip_size, mets_size, lastchecked) values(?,?,?,?,?,CURRENT_TIMESTAMP) \
ON DUPLICATE KEY UPDATE zip_size=?, mets_size=?, lastchecked = CURRENT_TIMESTAMP";

# Record the outcome of an md5 check for one object on one storage.
my $update =
"update feed_storage set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ? and storage_name = ?";

# Record one audit finding (status plus free-form detail) for a path.
my $insert_detail =
"insert into feed_audit_detail (namespace, id, storage_name, path, status, detail) values (?,?,?,?,?,?)";

# True when the object's last md5 check is newer than the bound checkpoint.
my $checkpoint_sel =
"select lastmd5check > ? from feed_storage where namespace = ? and id = ?";

### set /sdr1 to /sdrX for test & parallelization

# --- Command-line options ---
my $do_md5       = 0;        # --md5 / --nomd5: verify zip checksums against the METS
my $checkpoint   = undef;    # --checkpoint: skip objects md5-checked after this value
my $noop         = undef;    # --noop: do not execute insert/update statements
my $storage_name = undef;    # --storage_name: which storage is being audited

# Stop on unparseable options: continuing after, say, a mistyped --noop would
# silently run in mutating mode.
GetOptions(
    'md5!'           => \$do_md5,
    'checkpoint=s'   => \$checkpoint,
    'noop'           => \$noop,
    'storage_name=s' => \$storage_name,
) or die "Usage: $0 [--md5] [--checkpoint=VALUE] [--noop] --storage_name=NAME BASE_DIR\n";

# $storage_name must be one of 's3-truenas-ictc', 's3-truenas-macc'
my @valid_storage_names = ( 's3-truenas-macc', 's3-truenas-ictc' );
if ( !defined $storage_name ) {
    die '--storage_name is required';
}
if ( !grep { $storage_name eq $_ } @valid_storage_names ) {
    die "--storage_name must have value of 's3-truenas-macc' or 's3-truenas-ictc'";
}

# The first remaining argument is the directory to audit.
my $base          = shift @ARGV or die("Missing base directory..");
my $iterator      = HTFeed::RepositoryIterator->new($base);
my $sdr_partition = $iterator->{sdr_partition};
|
|
||
# Audit every object the iterator yields: record its zip/METS sizes in
# feed_storage and report anomalies through set_status. Each object is handled
# inside its own eval, so a failure on one object is warned about and the loop
# moves on to the next.
while ( my $obj = $iterator->next_object ) {
    my $path      = $obj->{path};
    my $namespace = $obj->{namespace};
    my $objid     = $obj->{objid};

    eval {
        # directory_objid and objid, both supplied by the iterator, must agree.
        if ( $obj->{directory_objid} ne $objid ) {
            set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE",
                "$objid $obj->{directory_objid}" );
        }

        # A single stat per file yields both size (index 7) and mtime
        # (index 9). The size is taken from stat rather than -s because -s
        # gives an empty string, not 0, for a zero-length file.
        my ( $zipsize, $zip_seconds );
        if ( my @zip_stat = stat("$path/$objid.zip") ) {
            ( $zipsize, $zip_seconds ) = @zip_stat[ 7, 9 ];
        }

        my ( $metssize, $mets_seconds );
        if ( my @mets_stat = stat("$path/$objid.mets.xml") ) {
            ( $metssize, $mets_seconds ) = @mets_stat[ 7, 9 ];
        }

        # test symlinks unless we're traversing sdr1
        if ( $sdr_partition ne '1' ) {

            # NOTE(review): the volume object is never used afterwards. It is
            # still constructed in case the constructor validates its
            # arguments and dies -- confirm before removing.
            HTFeed::Volume->new(
                packagetype => "pkgtype",
                namespace   => $namespace,
                objid       => $objid
            );

            # The matching path under sdr1 is expected to be a symlink that
            # points back at this object's directory.
            my $link_path = $path;
            $link_path =~ s/sdr\Q$sdr_partition\E/sdr1/;

            my $link_target = readlink $link_path;
            if ( !defined $link_target ) {
                set_status( $namespace, $objid, $storage_name, $path, "CANT_LSTAT",
                    "$link_path $!" );
            }
            elsif ( $link_target ne $path ) {
                set_status( $namespace, $objid, $storage_name, $path, "SYMLINK_INVALID",
                    $link_target );
            }
        }

        #insert
        execute_stmt(
            $insert,
            $namespace, $objid, $storage_name,
            $zipsize, $metssize,
            # duplicate parameters for duplicate key update
            $zipsize, $metssize,
        );

        # does barcode have a zip & xml, and do they match?
        my $filecount  = 0;
        my $found_zip  = 0;
        my $found_mets = 0;
        foreach my $file ( @{ $obj->{contents} } ) {
            next if $file =~ /pre_uplift\.mets\.xml$/;    # ignore backup mets

            # Dots are escaped so that only literal ".zip" / ".mets.xml"
            # suffixes are accepted.
            if ( $file !~ /^([^.]+)\.(zip|mets\.xml)$/ ) {
                set_status( $namespace, $objid, $storage_name, $path, "BAD_FILE", "$file" );
                next;
            }
            my ( $dir_barcode, $ext ) = ( $1, $2 );

            $found_zip++  if $ext eq 'zip';
            $found_mets++ if $ext eq 'mets.xml';
            if ( $objid ne $dir_barcode ) {
                set_status( $namespace, $objid, $storage_name, $path, "BARCODE_MISMATCH",
                    "$objid $dir_barcode" );
            }
            $filecount++;
        }

        # check file count; do md5 check and METS extraction stuff
        if ( defined $zip_seconds || defined $mets_seconds ) {

            # A missing zip is tolerated only for tombstoned objects.
            if (   $filecount > 2
                or $filecount < 1
                or ( $found_zip != 1 and not is_tombstoned( $namespace, $objid ) )
                or $found_mets != 1 )
            {
                set_status( $namespace, $objid, $storage_name, $path, "BAD_FILECOUNT",
                    "zip=$found_zip mets=$found_mets total=$filecount" );
            }

            # zipcheck returns undef when no check was made; only a defined
            # result is written back to feed_storage.
            eval {
                my $rval = zipcheck( $namespace, $objid, $storage_name );
                if ( defined $rval ) {
                    execute_stmt( $update, ( $rval ? "1" : "0" ),
                        $namespace, $objid, $storage_name );
                }
            };
            if ($@) {
                set_status( $namespace, $objid, $storage_name, $path, "CANT_ZIPCHECK", $@ );
            }
        }
    };

    if ($@) {
        warn($@);
    }
}

get_dbh()->disconnect();
$iterator->close;
|
|
||
# Compare the MD5 of an object's zip with the checksum recorded in its METS.
#
# Arguments: $namespace, $objid, $storage_name
#
# Returns (in scalar context):
#   1     - the checksums match
#   0     - the checksums differ (BAD_CHECKSUM is recorded)
#   undef - nothing was verified: md5 checking is off, the object is
#           tombstoned, it was already checked after the checkpoint, or the
#           METS has no 32-character checksum for the zip
#           (MISSING_METS_CHECKSUM is recorded in that last case)
#
# Exceptions from the volume/METS helpers or the database propagate to the
# caller.
sub zipcheck {
    my ( $namespace, $objid, $storage_name ) = @_;

    return unless $do_md5;

    return if is_tombstoned( $namespace, $objid );

    # don't check this item if we just looked at it
    if ( defined $checkpoint ) {

        # feed_storage rows are written per storage, so the lookup is narrowed
        # to the storage being audited; otherwise a recent check recorded for
        # a different storage could cause this one to be skipped.
        my $sth = execute_stmt( "$checkpoint_sel and storage_name = ?",
            $checkpoint, $namespace, $objid, $storage_name );
        my @row = $sth->fetchrow_array();
        return if @row and $row[0];
    }

    # NOTE(review): the METS and zip paths below come from the volume's
    # repository accessors, not from the base directory being traversed --
    # confirm they resolve to the storage under audit.
    my $volume = HTFeed::Volume->new(
        packagetype => "pkgtype",
        namespace   => $namespace,
        objid       => $objid
    );
    my $mets = $volume->get_repository_mets_xpc();

    # Extract the checksum for the zip file that looks kind of like this:
    #  <METS:fileGrp ID="FG1" USE="zip archive">
    #     <METS:file ID="ZIP00000001" MIMETYPE="application/zip" SEQ="00000001" CREATED="2008-11-22T20:07:28" SIZE="30844759" CHECKSUM="42417b735ae73a3e16d1cca59c7fac08" CHECKSUMTYPE="MD5">
    #        <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="39015603581748.zip" />
    #     </METS:file>
    #  </METS:fileGrp>
    my $checksum_for = sub {
        my ($href) = @_;
        return $mets->findvalue(
            "//mets:file[mets:FLocat/\@xlink:href='$href']/\@CHECKSUM");
    };

    # A usable checksum is exactly 32 characters long.
    my $is_usable = sub {
        my ($sum) = @_;
        return defined $sum && length($sum) == 32;
    };

    my $zipname     = $volume->get_zip_filename();
    my $mets_zipsum = $checksum_for->($zipname);

    if ( !$is_usable->($mets_zipsum) ) {
        # zip name may be uri-escaped in some cases
        $zipname     = uri_escape($zipname);
        $mets_zipsum = $checksum_for->($zipname);
    }

    if ( !$is_usable->($mets_zipsum) ) {
        set_status( $namespace, $objid, $storage_name,
            $volume->get_repository_mets_path(),
            "MISSING_METS_CHECKSUM", undef );
        return;
    }

    my $realsum = HTFeed::VolumeValidator::md5sum(
        $volume->get_repository_zip_path() );
    if ( $mets_zipsum eq $realsum ) {
        print "$zipname OK\n";
        return 1;
    }

    set_status( $namespace, $objid, $storage_name,
        $volume->get_repository_zip_path(),
        "BAD_CHECKSUM", "expected=$mets_zipsum actual=$realsum" );
    return 0;
}
|
|
||
# Report one audit finding: echo it to STDERR and store it in
# feed_audit_detail via $insert_detail.
#
# Arguments, in the column order of $insert_detail:
#   namespace, id, storage_name, path, status, detail
# Any argument may be undef (some callers pass undef as the detail).
#
# Returns whatever execute_stmt returns.
sub set_status {
    my @fields = @_;

    # Undefined fields are printed as empty strings so the log line does not
    # raise "uninitialized value" warnings; the database still receives the
    # original values, undef included.
    warn( join( " ", map { defined $_ ? $_ : '' } @fields ), "\n" );

    return execute_stmt( $insert_detail, @fields );
}
|
|
||
# Prepare and execute one SQL statement on the shared database handle.
#
# Arguments:
#   $stmt - SQL text with positional placeholders
#   @_    - the values to bind, in placeholder order
#
# Returns the executed statement handle. When $noop is set and the statement
# is an INSERT or UPDATE, nothing is executed and the sub returns nothing
# (undef in scalar context); SELECT statements always run.
sub execute_stmt {
    my $stmt = shift;

    # Bail out if noop and the SQL statement is mutating, SELECT is okay.
    # The alternation is grouped and anchored so that only the leading keyword
    # decides: a SELECT that merely mentions "update" (for example in a column
    # name) still runs, and upper-case statements are recognised too.
    return if $noop and $stmt =~ /^\s*(?:insert|update)\b/i;

    my $dbh = get_dbh();
    my $sth = $dbh->prepare($stmt);
    $sth->execute(@_);
    return $sth;
}
|
|
||
# There are as of early 2026 still 13 is_tombstoned entries in feed_audit, so this check stays.
#
# Look up the tombstone flag of an object.
#
# Arguments: $namespace, $objid
# Returns the is_tombstoned value of the first row returned by
# $tombstone_check, or 0 when the query returns no row.
sub is_tombstoned {
    my ( $namespace, $objid ) = @_;

    my $sth       = execute_stmt( $tombstone_check, $namespace, $objid );
    my @first_row = $sth->fetchrow_array();

    return @first_row ? $first_row[0] : 0;
}
|
|
||
| __END__ | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.