I'm trying to run a Perl script that uses MetaTagger to return metadata from a PDF and then uses PDF::API2 to actually update the metadata in the PDF. However, I keep getting a segmentation fault after encountering certain PDFs. I've wrapped every MetaTagger and PDF::API2 call in an eval block, but it still crashes.
If anyone could help, I would greatly appreciate it.
Thanks.
#!/opt/Interwoven/TeamSite/iw-perl/bin/iwperl
use Data::Dumper;
use File::Find;
use PDF::API2;
use XML::Simple;
use XML::Twig;
# ---------------------------------------------------------------------------
# Script setup: open the log, load the optional skip list, then walk $dir and
# hand every PDF found to directory_processing().
# ---------------------------------------------------------------------------
my $logFile = "/tmp/pdf-MT-all-files.log";
my $dir     = "/pdfdir";

# Path of a file listing PDFs that must not be processed.  Nothing in this
# script ever assigned it, so the skip list has always been empty.
# TODO: set this to the real skip-list path if one is wanted.
my $skipList;

# LOG stays a bareword handle on purpose: every sub in this file prints to it.
if ( !open( LOG, '>', $logFile ) ) {
    print STDERR "Failed to write to debug log file: $logFile\nError=$!\n";

    # Fall back to STDERR so that later "print LOG" calls are not lost.
    open( LOG, '>&', \*STDERR );
}
else {
    # Route anything written to STDERR/STDOUT by this process into the log.
    *STDERR = *LOG;
    *STDOUT = *LOG;
    print LOG "\n\n####\n## START : " . localtime() . " ##\n######\n";
}

# Slurp the skip list into one string; directory_processing() looks each
# candidate path up inside it.  A missing or unreadable list means "skip
# nothing".
my $skip_list_string = "";
if ( defined $skipList && length $skipList ) {
    if ( open( my $skip_fh, '<', $skipList ) ) {
        $skip_list_string = join( "", <$skip_fh> );
        close($skip_fh);
    }
    else {
        print LOG "Unable to read skip list $skipList: $!\n";
    }
}

find( \&directory_processing, $dir );
# directory_processing()
#
# File::Find callback.  For every visited entry whose full path
# ($File::Find::name) ends in ".pdf" (case-insensitive) and does not occur in
# $skip_list_string, logs the path and passes it to call_metagger().
sub directory_processing {
    my $path = $File::Find::name;

    # The dot is escaped so that only a real ".pdf" extension matches
    # (an unescaped "." would also accept names such as "notapdf").
    return unless $path =~ m/\.pdf$/i;

    # \Q..\E makes the path a literal substring search; without it, regex
    # metacharacters in a file name ("+", "(", "[" ...) would be interpreted
    # as pattern syntax and could mis-match or abort the traversal.
    return if $skip_list_string =~ m/\Q$path\E/i;

    print LOG "$path\n";
    call_metagger($path);
    return;
}
# Sub definitions are compiled, not executed, so this statement runs right
# after the find() call above has finished walking the directory tree.
close LOG;
# call_metagger($file)
#
# Runs the MetaTagger batch client (iwmtbatch) against $file, extracts the
# suggested keywords, description and facet descriptors from the <metadata>
# XML it prints, and passes the resulting keyword and description strings to
# update_pdf().
#
#   $file - full path of the PDF to tag.
#
# Returns nothing.  Every failure is written to LOG and ends processing of
# this file only; no exception escapes.
sub call_metagger {
    my ($file) = @_;

    my $hostname = `hostname`;
    $hostname = "" unless defined $hostname;
    $hostname =~ s/\s//g;

    # The command line goes through the shell (needed for 2>&1), so the path
    # is single-quoted to survive spaces and shell metacharacters.
    ( my $quoted_file = $file ) =~ s/'/'\\''/g;
    $quoted_file = "'$quoted_file'";

    my $mt_cmd =
        "/opt/Interwoven/TeamSite/bin/iwmtbatch -mthost utlp5055:9095"
      . " -suggest $quoted_file -nocontext"
      . " -contextURL http://$hostname:80/iw-bin/iwmtcontext.cgi -dump";
    print LOG "MT cmd: $mt_cmd\n";

    # Capture stdout and stderr of the tagger, one element per output line.
    my @data;
    if ( open( my $cmd_fh, '-|', "$mt_cmd 2>&1" ) ) {
        @data = <$cmd_fh>;
        close($cmd_fh);
    }
    else {
        print LOG "\n\n failed to start MetaTagger: $!\n";
        return;
    }

    print LOG "checking for errors...\n";
    if ( !@data ) {
        print LOG "\n\n failed \n";
        return;
    }

    # Any line containing ">ERROR</" marks the whole run as failed.
    my $errFlag = 0;
    foreach my $line (@data) {
        if ( $line =~ /\>ERROR\<\// ) {
            print LOG "\n\n failed: $line\n";
            $errFlag = 1;
        }
    }
    return if $errFlag;

    # Keep only the <metadata>...</metadata> element of the output.
    my $tempdata = join( "", @data );
    $tempdata =~ s/.*<metadata>(.*)<\/metadata>.*/<metadata>$1<\/metadata>/gs;
    return unless $tempdata;

    my $tempMetaTaggerResultsXml = "\n" . $tempdata;
    my ( @keywords, @description, @definitions );
    print LOG "extracting data\n";

    # Replace every character outside letters, digits, whitespace, "-", "\"
    # and "/" with a blank.
    my $clean = sub {
        my ($text) = @_;
        $text =~ s/[^a-zA-Z0-9 \t\r\n\v\f\-\\\/]/ /ig;
        return $text;
    };

    my $t      = XML::Twig->new();
    my $parsed = eval {
        $t->parse($tempMetaTaggerResultsXml);
        my $root = $t->root;

        # A missing Keywords or Description attribute aborts the extraction,
        # as it always did; the explicit die only gives a readable reason.
        my ($keywordsName) =
          $root->get_xpath('/metadata/attribute/name[string()="Keywords"]');
        die "no Keywords attribute in MetaTagger output\n"
          unless $keywordsName;
        push @keywords,
          map { $clean->( $_->text ) }
          $keywordsName->parent()->get_xpath('value');

        my ($descriptionName) =
          $root->get_xpath('/metadata/attribute/name[string()="Description"]');
        die "no Description attribute in MetaTagger output\n"
          unless $descriptionName;
        push @description,
          map { $clean->( $_->text ) }
          $descriptionName->parent()->get_xpath('value');

        foreach my $node ( $root->get_xpath('/metadata/facet/descriptor') ) {
            my $labelNode = $node->get_xpath( 'label', 0 );
            die "facet descriptor without a label\n" unless $labelNode;
            push @definitions, $clean->( $labelNode->text );
        }
        1;
    };
    if ( !$parsed ) {
        my $reason = $@ || "unknown error";
        print LOG "Errror parsing metadata\n";
        print LOG "$reason\n";
        return;
    }

    my $description = join( ", ", @description );

    # Keywords = the first ten facet descriptors, followed by every suggested
    # keyword that is not itself one of the descriptors.
    my @top_definitions =
      @definitions > 10 ? @definitions[ 0 .. 9 ] : @definitions;
    my %is_definition = map { $_ => 1 } @definitions;
    my $keywords = join( ", ",
        @top_definitions, grep { !$is_definition{$_} } @keywords );

    # Drop markup remnants (<...> and &...;) and leading non-word characters.
    for my $text ( $keywords, $description ) {
        $text =~ s/<.*?>/ /g;
        $text =~ s/&.*?;/ /g;
        $text =~ s/^\W+//;
    }
    chomp($keywords);
    chomp($description);

    # The values are handed over as plain text.  They used to be
    # backslash-escaped first, but update_pdf() only interpolates them into
    # replacement text, where escapes are not interpreted, so the backslashes
    # ended up literally in the PDF metadata.

    if ( $keywords ne "" && $description ne "" ) {

        # eval traps Perl exceptions (die) only; it cannot intercept a crash
        # of the interpreter process itself, such as a segmentation fault.
        eval { update_pdf( $keywords, $description, $file ) };
        if ($@) { print LOG "Error calling update pdf: $@\n"; }
    }
    return;
}
# update_pdf($found_meta_keyword_data, $found_meta_description_data, $file)
#
# Opens $file with PDF::API2, inserts the keywords and the description into
# the PDF's XMP packet where they are not already present, saves the file in
# place and locks it in TeamSite (iwlock).
#
#   $found_meta_keyword_data     - keyword text; "" means "leave keywords alone"
#   $found_meta_description_data - description text; "" means "leave it alone"
#   $file                        - full path of the PDF
#
# Progress and failures are written to LOG.  Dies (after releasing the PDF
# object) if writing the metadata or saving the file throws, so the caller's
# eval still sees the error.
sub update_pdf {
    my ( $found_meta_keyword_data, $found_meta_description_data, $file ) = @_;
    $found_meta_keyword_data     = "" unless defined $found_meta_keyword_data;
    $found_meta_description_data = ""
      unless defined $found_meta_description_data;

    print LOG "Description:$found_meta_description_data\n";
    print LOG "Keywords:$found_meta_keyword_data\n";
    print LOG "opening pdf\n";

    my $pdf = eval { PDF::API2->open($file) };
    if ( $@ || !$pdf ) {
        print LOG "$file\t\tError opening File\n\n";
        print LOG "$@\n";
        return;
    }

    # Values are spliced into XML text, so the five XML special characters
    # are replaced by entities first.
    my %xml_entity = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        "'" => '&apos;',
        '"' => '&quot;',
    );
    my $xml_escape = sub {
        my ($text) = @_;
        $text =~ s/([&<>'"])/$xml_entity{$1}/g;
        return $text;
    };
    my $kw   = $xml_escape->($found_meta_keyword_data);
    my $desc = $xml_escape->($found_meta_description_data);

    my $failure;
    my $xml = eval { $pdf->xmpMetadata };
    if ($@) {
        print LOG "$file\t\tError in pdf metadata\n\n";
    }
    else {
        $xml = "" unless defined $xml;
        my $original_xml = $xml;

        # Keywords: skip when the packet already carries pdf:Keywords, either
        # as an element or as an attribute.  Otherwise add it right after
        # pdf:Producer, matching whichever of the two forms Producer uses.
        if ( $kw ne "" && $xml !~ m{</pdf:Keywords>|\bpdf:Keywords\s*=}i ) {

            # The attribute form captures the existing Producer attribute and
            # re-emits it unchanged; the earlier replacement text overwrote
            # the Producer value with a literal ".*?".
            $xml =~ s{</pdf:Producer>}{</pdf:Producer><pdf:Keywords>$kw</pdf:Keywords>}i
              or $xml =~ s{(pdf:Producer=(['"]).*?\2)}{$1 pdf:Keywords=$2$kw$2}is;
        }

        # Description: added after </dc:title>, or after </dc:creator> when
        # there is no title.
        # NOTE(review): the opening tags were reconstructed -- the code as
        # received inserted only the closing </rdf:li></rdf:Alt>
        # </dc:description>, which yields malformed XML.  xml:lang="x-default"
        # follows the XMP language-alternative convention; confirm it matches
        # what the original script wrote.
        if ( $desc ne "" && $xml !~ m{</dc:description>}i ) {
            my $element =
                qq{<dc:description><rdf:Alt><rdf:li xml:lang="x-default">}
              . $desc
              . qq{</rdf:li></rdf:Alt></dc:description>};
            $xml =~ s{</dc:title>}{</dc:title>$element}i
              or $xml =~ s{</dc:creator>}{</dc:creator>$element}i;
        }

        if ( $xml eq $original_xml ) {

            # Nothing to insert, or no anchor element found: do not rewrite
            # or lock a file whose metadata would be unchanged.
            print LOG "$file\t\tNo metadata changes applied\n\n";
        }
        else {
            print LOG "about to update \n";
            my $saved = eval {
                $pdf->xmpMetadata($xml);
                $pdf->update;
                1;
            };
            if ( !$saved ) {
                $failure = $@ || "unknown error while saving $file\n";
            }
            else {
                # iwlock is given the path without the /iwmnt mount prefix.
                ( my $ts_path = $file ) =~ s/\/iwmnt//i;
                ( my $quoted_path = $ts_path ) =~ s/'/'\\''/g;
                my $lock_cmd = "/opt/Interwoven/TeamSite/bin/iwlock"
                  . " '$quoted_path' 'Metadata updated'";

                # Backticks never die, so the exit status is what reveals a
                # failed lock.
                my $lock_output = `$lock_cmd`;
                if ( $? != 0 ) {
                    print LOG "$ts_path\t\tUnable to lock file\n\n";
                }
                print LOG "$ts_path\t\tSuccessfully updated metadata\n\n";
            }
        }
    }

    # Always release the PDF object, including after a failed save.
    eval { $pdf->end; };
    if ($@) { print LOG "$file\t\tUnable to close pdf\n\n"; }

    die $failure if $failure;
    return;
}
# ok(@words)
#
# Prints a banner to STDERR: two blank lines, a rule of 60 "=" characters,
# then "= " followed by the arguments (joined with the list separator $")
# in upper case, then a blank line.
sub ok {
    my $rule    = '=' x 60;
    my $caption = uc( '= ' . join( $", @_ ) );
    printf STDERR "\n\n%s\n%s\n\n", $rule, $caption;
}