2009年6月30日 星期二

取得pdf檔内文

基本上是利用模块CAM::PDF,以其提供的范例修改

#!/usr/bin/perl -w

package main;

use warnings;
use strict;
use CAM::PDF;
use Getopt::Long;
use Pod::Usage;

our $VERSION = '1.52';
my $lastx=0;
my $lasty=0;

my %opts = (
check => 0,
geom => 0,
verbose => 0,
help => 0,
version => 0,
);

Getopt::Long::Configure('bundling');
GetOptions('g|geometry' => \$opts{geom},
'c|check' => \$opts{check},
'v|verbose' => \$opts{verbose},
'h|help' => \$opts{help},
'V|version' => \$opts{version},
) or pod2usage(1);
if ($opts{help})
{
pod2usage(-exitstatus => 0, -verbose => 2);
}
if ($opts{version})
{
print "CAM::PDF v$CAM::PDF::VERSION\n";
exit 0;
}

if (@ARGV < 1)
{
pod2usage(1);
}

my $file = shift;
my $pagelist = shift;
my $outfile = shift;
my $doc = CAM::PDF->new($file) || die "$CAM::PDF::errstr\n";
open (FILE, ">$outfile");
open (FILE2, ">my$outfile");
foreach my $p ($doc->rangeToArray(1,$doc->numPages(),$pagelist))
{
if ($opts{check})
{
print "Checking page $p\n";
my $tree = $doc->getPageContentTree($p, $opts{verbose});
if (!$tree || !$tree->validate())
{
print " Failed\n";
}
else{
# print $tree;
$tree->render("CAM::PDF::Renderer::Dump");
# $tree->render("CAM::PDF::Renderer::Text");
}
if ($opts{geom})
{
$tree->computeGS();
}
}
else
{
my $str = $doc->getPageContent($p, $opts{verbose});
if (defined $str)
{
# CAM::PDF->asciify(\$str);
print FILE $str;
}
my $pagetree = $doc->getPageContentTree($p);
if (!$pagetree)
{
next;
}
$str=render($pagetree);
if (defined $str)
{
$str=~s/\. \./\.\./g;
$str=~s/^[ ]+$//mg;
$str=~s/\n+/\n/g;
$str=~s/\n.+\n//; #remove the first two lines
$str=~s/\n.\n//;
$str=~s/([a-zA-Z])\n/$1/mg;
$str=~s/\.+([0-9]+)/\($1\)/g;
$str=~s/^(I|II|III|IV|V|VI|VII|VIII|IX|x)([a-z]?)\./ $1$2\./mg;
$str=~s/^([A-Z])\./ $1\./mg;
$str=~s/^([0-9]+)\./ $1\./mg;
# $str=~s/\.+//g;
print FILE2 $str;
}
}
}

close FILE;
close FILE2;

sub _TJ
{
my $str = shift;
my $args_ref = shift;

if (@{$args_ref} != 1 || $args_ref->[0]->{type} ne 'array')
{
die 'Bad TJ';
}

$str =~ s/ (\S) \z /$1 /xms;
foreach my $node (@{$args_ref->[0]->{value}})
{
if ($node->{type} eq 'string' || $node->{type} eq 'hexstring')
{
$str .= $node->{value};
}
elsif ($node->{type} eq 'number')
{
# Heuristic:
# "offset of more than a quarter unit forward"
# means significant positive spacing
if ($node->{value} < -250)
{
$str =~ s/ (\S) \z /$1 /xms;
}
}
}
return $str;
}

sub _Tj
{
my $str = shift;
my $args_ref = shift;

if (@{$args_ref} < 1 ||
($args_ref->[-1]->{type} ne 'string' && $args_ref->[-1]->{type} ne 'hexstring'))
{
die 'Bad Tj';
}

$str =~ s/ (\S) \z /$1 /xms;

return $str . $args_ref->[-1]->{value};
}

sub _Tquote
{
my $str = shift;
my $args_ref = shift;

if (@{$args_ref} < 1 ||
($args_ref->[-1]->{type} ne 'string' && $args_ref->[-1]->{type} ne 'hexstring'))
{
die 'Bad Tquote';
}

$str =~ s/ [ ]* \z /\n/xms;

return $str . $args_ref->[-1]->{value};
}

sub _Td
{
my $str = shift;
my $args_ref = shift;

if (@{$args_ref} != 2 ||
$args_ref->[0]->{type} ne 'number' ||
$args_ref->[1]->{type} ne 'number')
{
die 'Bad Td/TD';
}

# Heuristic:
# "move down in Y, and Y motion a large fraction of the X motion"
# means new line
if ($args_ref->[1]->{value} < 0
# &&
# 2 * (abs $args_ref->[1]->{value}) > abs $args_ref->[0]->{value}
)
{
$str =~ s/ [ ]* \z /\n/xms;
}

return $str;
}

sub _Tm
{
my $str = shift;
my $args_ref = shift;

if (@{$args_ref} != 6 ||
$args_ref->[5]->{type} ne 'number' ||
$args_ref->[4]->{type} ne 'number')
{
die 'Bad Td/TD';
}
if ($lasty!=$args_ref->[5]->{value}){
$str=$str."\n";
}
$lastx=$args_ref->[4]->{value};
$lasty=$args_ref->[5]->{value};
# Heuristic:
# "move down in Y, and Y motion a large fraction of the X motion"
# means new line
# if ($args_ref->[1]->{value} < 0 &&
# 2 * (abs $args_ref->[1]->{value}) > abs $args_ref->[0]->{value})
# {
# $str =~ s/ [ ]* \z /\n/xms;
# }

# return $str."(".$args_ref->[4]->{value}."/".$args_ref->[5]->{value}.")";
return $str;
}

sub _Tstar
{
my $str = shift;

$str =~ s/ [ ]* \z /\n/xms;

return $str;
}
sub render
{
# my $pkg = shift;
my $pagetree = shift;
my $verbose = shift;

my $str = q{};
my @stack = ([@{$pagetree->{blocks}}]);
my $in_textblock = 0;

## The stack is a list of blocks. We do depth-first on blocks, but
## we must be sure to traverse the children of the blocks in their
## original order.

while (@stack > 0)
{
# keep grabbing the same node until it's empty
my $node = $stack[-1];
if (ref $node)
{
if (@{$node} > 0) # Still has children?
{
my $block = shift @{$node}; # grab the next child
if ($block->{type} eq 'block')
{
if ($block->{name} eq 'BT')
{
# Insert a flag on the stack to say when we leave the BT block
push @stack, 'BT';
$in_textblock = 1;
}
push @stack, [@{$block->{value}}]; # descend
}
elsif ($in_textblock)
{
if ($block->{type} ne 'op')
{
die 'misconception';
}
my @args = @{$block->{args}};

$str = $block->{name} eq 'TJ' ? _TJ( $str, \@args )
: $block->{name} eq 'Tj' ? _Tj( $str, \@args )
: $block->{name} eq q{\'} ? _Tquote( $str, \@args )
: $block->{name} eq q{\"} ? _Tquote( $str, \@args )
: $block->{name} eq 'Td' ? _Td( $str, \@args )
: $block->{name} eq 'TD' ? _Td( $str, \@args )
: $block->{name} eq 'Tm' ? _Tm( $str, \@args )
: $block->{name} eq 'T*' ? _Tstar( $str )
: $str;
}
}
else
{
# Node is now empty, clear it from the stack
pop @stack;
}
}
else
{
# This is the 'BT' flag we pushed on the stack above
pop @stack;
$in_textblock = 0;

# Add a line break to divide the text
$str =~ s/ [ ]* \z /\n/xms;
}
}
return $str;
}

沒有留言:

張貼留言