package SWISH::Filters::ODF2xml;
use strict;
use XML::Parser;
use Archive::Zip qw( :ERROR_CODES );
use vars qw/ $VERSION /;
$VERSION = '0.02';
# An OpenDocument file is a zip archive. Three of its members are read from
# it (see Unzip_file): content.xml, meta.xml, and mimetype.
#
# Output buffer shared by the XML::Parser handler subs in this file: each
# handler pushes the text it wants to emit, and Parse_XML empties the buffer
# before a parse and joins it into one string afterwards.
my @parsed = ();
# Constructor.
#
# Builds the filter object, whose only attribute set here is "mimetypes": the
# list of content-type patterns (OpenDocument types) this filter declares.
# The object is then handed to set_programs('unzip') and whatever that call
# returns is returned to the caller.
#
# set_programs is not defined in this file (presumably inherited from the
# filter framework -- confirm).
# NOTE(review): nothing in this file runs an external "unzip" program; the
# archive is read with Archive::Zip. Confirm whether requiring 'unzip' is
# still intended.
sub new {
    my ($class) = @_;

    # The patterns are anchored at the start only, so e.g. the "text" entry
    # also matches the longer "text-template" type.
    my @odf_types = (
        qr!^application/vnd\.oasis\.opendocument\.text!,
        qr!^application/vnd\.oasis\.opendocument\.text-template!,
        qr!^application/vnd\.oasis\.opendocument\.spreadsheet!,
        qr!^application/vnd\.oasis\.opendocument\.spreadsheet-template!,
        qr!^application/vnd\.oasis\.opendocument\.presentation!,
        qr!^application/vnd\.oasis\.opendocument\.presentation-template!,
        qr!^application/vnd\.oasis\.opendocument\.text-master!,
        qr!^application/vnd\.oasis\.opendocument\.text-web!,
        qr!^application/vnd\.oasis\.opendocument\.graphics!,
        qr!^application/vnd\.oasis\.opendocument\.graphics-template!,
        qr!^application/vnd\.oasis\.opendocument\.chart!,
        qr!^application/vnd\.oasis\.opendocument\.chart-template!,
        qr!^application/vnd\.oasis\.opendocument\.image!,
        qr!^application/vnd\.oasis\.opendocument\.image-template!,
        qr!^application/vnd\.oasis\.opendocument\.formula!,
        qr!^application/vnd\.oasis\.opendocument\.formula-template!,
    );

    my $filter = bless { mimetypes => \@odf_types }, $class;

    return $filter->set_programs('unzip');
}
# Convert one OpenDocument file into a single XML string for indexing.
#
# Arguments:
#   $self - the filter object (not used here).
#   $doc  - the document object supplied by the caller; only its
#           fetch_filename and set_content_type methods are called.
# Returns: a reference to the generated XML string.
# Dies with whatever Unzip_file dies with (unreadable archive, missing
# member, XML parse failure).
#
# No debug output is written: an earlier revision truncated /tmp/x and ran
# `date` on every call.
sub filter {
    my ( $self, $doc ) = @_;

    # Archive::Zip reads from a path, so ask the document for a file name.
    my $file = $doc->fetch_filename;

    my ( $mimetype, $metadata, $content ) = Unzip_file($file);

    # Mark the document as XML only once extraction has succeeded, so a
    # failed conversion does not leave it mislabeled.
    $doc->set_content_type('text/xml');

    # The mimetype member is plain text: trim it and escape it so that it
    # can sit inside an element.
    $mimetype =~ s/^\s+//;
    $mimetype =~ s/\s+$//;
    $mimetype =~ s/&/&amp;/g;
    $mimetype =~ s/</&lt;/g;
    $mimetype =~ s/>/&gt;/g;
    $mimetype = qq(<mimetype>$mimetype</mimetype>);

    # NOTE(review): the markup that originally surrounded these three parts
    # was not recoverable from the reviewed text. The XML declaration and the
    # <mimetype>/<document> element names are a reconstruction that gives the
    # output a single root element -- confirm the names against the
    # MetaNames used in swish.conf.
    my $document = <<"EOF";
<?xml version="1.0" encoding="UTF-8"?>
<document>
$mimetype$metadata$content
</document>
EOF

    return \$document;
}
# Pull the three parts needed for indexing out of an OpenDocument zip file.
#
# Argument: $file - path handed to Archive::Zip's read().
# Returns:  the list ( $mimetype, $metadata, $content ). $metadata and
#           $content are the results of Parse_XML on meta.xml and
#           content.xml; $mimetype is the text of the "mimetype" member.
# Dies if the archive cannot be read, or if a member comes back false
# (missing or empty). Anything Parse_XML dies with propagates unchanged.
sub Unzip_file {
    my ($file) = @_;

    my $archive = Archive::Zip->new();
    if ( $archive->read($file) != AZ_OK ) {
        die('read error');
    }

    # Document body; parsed (without the metadata start handler) before the
    # next member is fetched.
    my $body = $archive->contents('content.xml');
    die("Failed to extract content\n") unless $body;
    $body = Parse_XML( $body, 0 );

    # Document metadata; the true flag selects the metadata start handler.
    my $meta = $archive->contents('meta.xml');
    die("Failed to extract metadata\n") unless $meta;
    $meta = Parse_XML( $meta, 1 );

    # Content-type string stored in the archive; returned as is.
    my $type = $archive->contents('mimetype');
    die("Failed to extract mimetype\n") unless $type;

    return ( $type, $meta, $body );
}
# Run an XML string through XML::Parser and return the text the handlers
# collected in the shared @parsed buffer.
#
# Arguments:
#   $xml  - the XML document as a string.
#   $meta - true to install Start_meta as the start-tag handler; when false
#           no start-tag handler is installed at all.
# Returns: the buffered fragments joined into one string.
#
# Comments, processing instructions, the doctype and the XML declaration are
# given handlers that do nothing, so they never reach the Default handler.
# NOTE(review): no external-entity handler is configured here; if untrusted
# documents are indexed, confirm how XML::Parser resolves external entities
# in this setup.
sub Parse_XML {
    my ( $xml, $meta ) = @_;

    my $discard = sub {''};
    my %handler_for = (
        ( map { ( $_ => $discard ) } qw( Comment Proc Doctype XMLDecl ) ),
        End     => \&End,
        Default => \&Default,
    );
    $handler_for{Start} = \&Start_meta if $meta;

    @parsed = ();    # start from an empty buffer

    my $parser = XML::Parser->new();
    $parser->setHandlers(%handler_for);
    $parser->parse($xml);

    return join( '', @parsed );
}
# XML::Parser "Default" handler.
# Receives the parser object and a piece of text, and appends that text,
# unchanged, to the shared @parsed buffer.
sub Default {
    my ( undef, $text ) = @_;
    push @parsed, $text;
}
# XML::Parser "Start" handler that rebuilds a start tag and appends it to the
# shared @parsed buffer.
#
# Arguments: the parser object (unused), the element name, then the
# element's attributes as a flat name/value list.
#
# Attribute values are escaped before being written back between double
# quotes: the parser hands them over already decoded, so a value containing
# &, <, > or " would otherwise produce malformed markup.
#
# NOTE(review): nothing in this file registers this handler; confirm whether
# it is still needed.
sub Start {
    my ( $expat, $element, @pairs ) = @_;

    my $tag = qq(<$element);
    while ( my ( $name, $value ) = splice( @pairs, 0, 2 ) ) {
        $value = '' unless defined $value;
        $value =~ s/&/&amp;/g;    # must come first
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
        $tag .= qq( $name="$value");
    }
    $tag .= qq(>);

    push @parsed, $tag;
}
# XML::Parser "Start" handler for metadata.
#
# Arguments: the parser object, the element name, then the element's
# attributes as a flat name/value list.
#
# Rebuilds the start tag and appends it to the shared @parsed buffer.
# A <meta:user-defined> element is renamed after its meta:name attribute:
# the name is lower-cased, each whitespace character becomes "-", and runs
# of "-" are squeezed to one. The rename is skipped (the original element
# name is kept) when the result is empty or could not serve as an XML
# element name.
#
# Attribute values are escaped before being written back between double
# quotes, because the parser hands them over already decoded.
#
# The name actually written is remembered on the parser object so that End
# closes the element under the same name.
sub Start_meta {
    my ( $expat, $element, @pairs ) = @_;

    my %attr;
    my $attr_text = '';
    while ( my ( $name, $value ) = splice( @pairs, 0, 2 ) ) {
        $value = '' unless defined $value;
        $attr{$name} = $value;    # unescaped copy, used for the rename

        ( my $escaped = $value ) =~ s/&/&amp;/g;    # must come first
        $escaped =~ s/</&lt;/g;
        $escaped =~ s/>/&gt;/g;
        $escaped =~ s/"/&quot;/g;
        $attr_text .= qq( $name="$escaped");
    }

    my $emitted = $element;       # all other metadata
    if ( $element eq 'meta:user-defined' ) {
        my $alias = lc( defined $attr{'meta:name'} ? $attr{'meta:name'} : '' );
        $alias =~ s/\s/-/g;
        $alias =~ tr/-/-/s;

        # Only rename when the alias is usable as an element name; anything
        # else would make the output unparsable.
        $emitted = $alias
            if $alias && $alias =~ /\A[^\W\d][\w.:-]*\z/;
    }

    # Per-parse stack of the names written, kept on the parser object so no
    # state leaks from one parse into the next.
    push @{ $expat->{ODF2xml_open} }, $emitted
        if UNIVERSAL::isa( $expat, 'HASH' );

    push @parsed, qq(<$emitted$attr_text>);
}

# XML::Parser "End" handler.
#
# Arguments: the parser object and the element name.
#
# Appends the closing tag to the shared @parsed buffer. When Start_meta
# recorded the name it wrote for the matching start tag, that name is used
# (so renamed user-defined metadata is closed correctly); otherwise the
# element name reported by the parser is used.
sub End {
    my ( $expat, $element ) = @_;

    my $open = UNIVERSAL::isa( $expat, 'HASH' ) ? $expat->{ODF2xml_open} : undef;
    my $name = ( $open && @{$open} ) ? pop @{$open} : $element;

    push @parsed, qq(</$name>);
}
1;
__END__
=head1 NAME
SWISH::Filters::ODF2xml - Perl extension for filtering OpenDocument v 1.0 documents with Swish-e
=head1 DESCRIPTION
This plug-in module uses the Archive::Zip and XML::Parser modules from CPAN to convert OpenDocument to a flat XML stream for indexing by Swish-e.
Three parts (mimetype, meta.xml, and content.xml) are extracted and processed.
=head2 METADATA
The MetaNameAlias directive must be used in swish.conf to index the
document titles and other metadata properly. e.g:
MetaNameAlias swishtitle dc:title
MetaNameAlias keyword meta:keyword dc:subject
MetaNameAlias swishdescription dc:description
Any user-defined metadata should also be mapped with MetaNameAlias if needed.
For example, if the field "Info 1" is renamed "DC:Creator", it will be parsed as "dc:creator" and needs to be recognized in swish.conf.
=head1 AUTHOR
Lars Noodén
=head1 SEE ALSO