<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<nutch-conf>

    <!-- Site override of the plugin whitelist. The expression below enables
         the file and pmid protocol plugins, the regex URL filter, the text,
         html and pubmed parsers, basic indexing, and the basic, site and url
         query plugins. It does not match a protocol-http plugin. -->
    <property>
	<name>plugin.includes</name>
	<value>nutch-extensionpoints|protocol-(file|pmid)|urlfilter-regex|parse-(text|html|pubmed)|index-basic|query-(basic|site|url)</value>
	<description>Regular expression naming plugin directory names to
	    include.  Any plugin not matching this expression is excluded.
	    In any case you need to include at least the nutch-extensionpoints
	    plugin. By default Nutch includes crawling just HTML and plain text
	    via HTTP, and basic indexing and search plugins; this site override
	    instead enables the file and pmid protocols and the text, html and
	    pubmed parsers.
	</description>
    </property>
    <!-- Selects the URL normalizer implementation by fully qualified class
         name. NOTE(review): the class name suggests it performs little or no
         rewriting of URLs; confirm against the TrivialUrlNormalizer source
         before relying on that. -->
    <property>
	<name>urlnormalizer.class</name>
	<value>org.apache.nutch.net.TrivialUrlNormalizer</value>
	<description>Name of the class used to normalize URLs.</description>
    </property>
    <!-- Location of the colorizer used for PMC abstracts/citations. The value
         is an absolute http URL, not a filesystem path.
         NOTE(review): this key uses the prefix "protocol.pmc" while the plugin
         enabled in plugin.includes is protocol-pmid; confirm that the
         consuming code reads exactly this key. -->
    <property>
	<name>protocol.pmc.colorizer</name>
	<value>http://oriole.eecs.umich.edu/cgi-bin/colorizeHtmlText.pl</value>
	<description>URL of the PMC abstract/citations colorizer</description>
    </property>
    <!-- PubMed Entrez URL form expected by protocol-pmid; per the description,
         a list_uids parameter carrying the PMID is appended to it.
         The value backslash-escapes '.' and '?', so it looks like a regular
         expression to match URLs against rather than a literal URL to fetch.
         NOTE(review): confirm how protocol-pmid consumes this value.
         The ampersands are XML-escaped; the parsed value contains plain
         ampersand characters. -->
    <property>
	<name>protocol.pmid.pubmed_url</name>
	<value>http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?cmd=Retrieve&amp;db=pubmed&amp;dopt=Abstract&amp;query_hl=1&amp;itool=pubmed_docsum</value>
	<description>URL to PubMed Entrez search utility. URLs for protocol-pmid should be of this form, with &amp;list_uids=$PMID on the end.</description>
    </property>
    <!-- Site override of the anchor length cap: anchors may be up to
         65536 (2^16) characters long. -->
    <property>
	<name>db.max.anchor.length</name>
	<value>65536</value>
	<description>The maximum number of characters permitted in an anchor.
	</description>
    </property>
    <!-- Set to false here, so links between pages on the same host are kept
         in the link database rather than ignored. -->
    <property>
	<name>db.ignore.internal.links</name>
	<value>false</value>
	<description>If true, when adding new links to a page, links from
	    the same host are ignored.  This is an effective way to limit the
	    size of the link database, keeping only the highest quality
	    links.
	</description>
    </property>
</nutch-conf>
