package SWISH::Filters::ODF2xml;

use strict;
use warnings;
use XML::Parser;
use Archive::Zip qw( :ERROR_CODES );

use vars qw/ $VERSION /;
$VERSION = '0.02';

# Three members are read from the zip archive of an OpenDocument file:
# content.xml, meta.xml, and mimetype.

# Output buffer filled by the XML::Parser handlers below.
# Parse_XML() clears it at the start of every parse.
my @parsed = ();

# Names of the start tags written to @parsed that have not been closed yet.
# End() pops from this stack so that a closing tag always carries the same
# name as the start tag that was actually written (Start_meta() may rename
# elements).
my @open_tags = ();

# new( $class )
#
# Builds the filter object with the list of MIME type patterns it accepts
# and returns whatever set_programs('unzip') returns.
#
# set_programs() is not defined in this file; presumably it is inherited
# from the SWISH::Filters framework.
# NOTE(review): extraction below goes through Archive::Zip, so the external
# "unzip" program may not really be required -- confirm before changing it.
sub new {
    my ( $class ) = @_;

    my $self = bless {
        mimetypes => [
            qr!^application/vnd\.oasis\.opendocument\.text!,
            qr!^application/vnd\.oasis\.opendocument\.text-template!,
            qr!^application/vnd\.oasis\.opendocument\.spreadsheet!,
            qr!^application/vnd\.oasis\.opendocument\.spreadsheet-template!,
            qr!^application/vnd\.oasis\.opendocument\.presentation!,
            qr!^application/vnd\.oasis\.opendocument\.presentation-template!,
            qr!^application/vnd\.oasis\.opendocument\.text-master!,
            qr!^application/vnd\.oasis\.opendocument\.text-web!,
            qr!^application/vnd\.oasis\.opendocument\.graphics!,
            qr!^application/vnd\.oasis\.opendocument\.graphics-template!,
            qr!^application/vnd\.oasis\.opendocument\.chart!,
            qr!^application/vnd\.oasis\.opendocument\.chart-template!,
            qr!^application/vnd\.oasis\.opendocument\.image!,
            qr!^application/vnd\.oasis\.opendocument\.image-template!,
            qr!^application/vnd\.oasis\.opendocument\.formula!,
            qr!^application/vnd\.oasis\.opendocument\.formula-template!,
        ],
    }, $class;

    return $self->set_programs( 'unzip' );
}

# filter( $self, $doc )
#
# Converts the OpenDocument file behind $doc into one flat XML string.
# Marks $doc as 'text/xml' and returns a reference to the XML string.
# Dies (via Unzip_file) when the archive or one of its members cannot
# be read.
#
# No debug output is written: a fixed, shared path such as /tmp/x must not
# be opened unconditionally from a filter.
sub filter {
    my ( $self, $doc ) = @_;

    # We need a file name to pass to the conversion function
    my $file = $doc->fetch_filename;

    $doc->set_content_type( 'text/xml' );

    my ( $mimetype, $metadata, $content ) = Unzip_file( $file );

    # NOTE(review): the original wrapper markup around the three parts was
    # not recoverable; <document> and <mimetype> are reconstructed names --
    # confirm against the upstream module.  A single root element is needed
    # for the result to be well-formed XML.
    my $document = join( '',
        "<document>\n",
        '<mimetype>', _escape_xml( $mimetype ), '</mimetype>',
        $metadata,
        $content,
        "\n</document>\n",
    );

    return \$document;
}

# Unzip_file( $file )
#
# Reads the zip archive $file and returns the list
# ( $mimetype, $metadata, $content ):
#   $mimetype - raw contents of the 'mimetype' member
#   $metadata - 'meta.xml' after Parse_XML() in metadata mode
#   $content  - 'content.xml' after Parse_XML() in content mode
# Dies when the archive cannot be read or a member is missing or empty.
sub Unzip_file {
    my ( $file ) = @_;

    my $zip = Archive::Zip->new();
    die ('read error') unless ( $zip->read( $file ) == AZ_OK );

    my $content = $zip->contents( 'content.xml' )
        or die ("Failed to extract content\n");
    $content = Parse_XML( $content, 0 );

    my $metadata = $zip->contents( 'meta.xml' )
        or die ("Failed to extract metadata\n");
    $metadata = Parse_XML( $metadata, 1 );

    my $mimetype = $zip->contents( 'mimetype' )
        or die ("Failed to extract mimetype\n");

    return ( $mimetype, $metadata, $content );
}

# Parse_XML( $xml, $meta )
#
# Runs $xml through XML::Parser and returns the re-serialised markup as one
# string.  Comments, processing instructions, the doctype and the XML
# declaration are dropped.  When $meta is true, start tags go through
# Start_meta() (which renames user-defined metadata elements); otherwise
# they go through Start().
#
# Every start tag is written by a Start handler, so each closing tag
# written by End() pairs with a tag this module wrote; an empty-element
# tag therefore comes out as <x></x>.
sub Parse_XML {
    my ( $xml, $meta ) = @_;

    @parsed    = ();    # clear buffer
    @open_tags = ();

    my $p = XML::Parser->new();

    $p->setHandlers (
        Comment => sub {''},
        Proc    => sub {''},
        Doctype => sub {''},
        XMLDecl => sub {''},
        Start   => ( $meta ? \&Start_meta : \&Start ),
        End     => \&End,
        Default => \&Default,
    );

    $p->parse ( $xml );

    return join( '', @parsed );
}

# Default handler: appends the data handed over by the parser to the
# output buffer unchanged.
sub Default {
    my ( $expat, $data ) = @_;

    push ( @parsed, $data );
}

# Start handler for content: writes <element name="value" ...> to the
# output buffer, with attribute values escaped for use inside double quotes.
sub Start {
    my ( $expat, $element, @attributes ) = @_;

    push ( @open_tags, $element );
    push ( @parsed, qq(<$element) . _attribute_text( @attributes ) . qq(>) );
}

# Start handler for metadata.  Behaves like Start(), except that a
# meta:user-defined element is written under a name derived from its
# meta:name attribute: lower-cased, whitespace turned into '-', and runs
# of '-' squeezed.  The original element name is kept when the attribute is
# missing or the derived name would not be usable as an element name.
sub Start_meta {
    my ( $expat, $element, @attributes ) = @_;

    my %attr = @attributes;
    my $name = $element;

    if ( $element eq 'meta:user-defined' ) {
        my $custom = defined $attr{'meta:name'} ? lc $attr{'meta:name'} : '';
        $custom =~ s/\s/-/g;
        $custom =~ tr/-/-/s;

        # Must start with a letter or underscore to be a valid tag name.
        $name = $custom if ( $custom =~ /\A[^\W\d][\w.:-]*\z/ );
    }

    push ( @open_tags, $name );
    push ( @parsed, qq(<$name) . _attribute_text( @attributes ) . qq(>) );
}

# End handler: writes the closing tag for the most recently opened element,
# using the name that was actually written for its start tag.
sub End {
    my ( $expat, $element ) = @_;

    my $name = @open_tags ? pop @open_tags : $element;

    push ( @parsed, qq(</$name>) );
}

# Serialises a flat ( name, value, name, value, ... ) list as
# ' name="value"' pairs, in the order given.
sub _attribute_text {
    my ( @attributes ) = @_;

    my $text = '';
    while ( @attributes ) {
        my ( $attribute, $value ) = splice( @attributes, 0, 2 );
        $text .= qq( $attribute=") . _escape_xml( $value ) . qq(");
    }

    return $text;
}

# Escapes &, <, > and " so that $text can be placed in element content or
# in a double-quoted attribute value.  Undef is treated as the empty string.
sub _escape_xml {
    my ( $text ) = @_;

    return '' unless defined $text;

    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}

1;
__END__

=head1 NAME

SWISH::Filters::ODF2xml - Perl extension for filtering OpenDocument v 1.0 documents with Swish-e

=head1 DESCRIPTION

This plug-in module uses the Archive::Zip and XML::Parser modules from CPAN
to convert OpenDocument to a flat XML stream for indexing by Swish-e.
Three parts (mimetype, meta.xml, and content.xml) are extracted and processed.

=head2 METADATA

The MetaNameAlias directive must be used in swish.conf to index the document
titles and other metadata properly, e.g.:

    MetaNameAlias swishtitle dc:title
    MetaNameAlias keyword meta:keyword dc:subject
    MetaNameAlias swishdescription dc:description

Any user-defined metadata should also be mapped with MetaNameAlias if needed.
For example, if field "Info 1" is renamed "DC:Creator" then it will be parsed
as "dc:creator" and needs to be recognized in swish.conf.

=head1 AUTHOR

Lars Noodén

=head1 SEE ALSO