diff -Naur harvester/add.php harvester.new/add.php --- harvester/add.php 2004-02-22 17:50:03.000000000 -0700 +++ harvester.new/add.php 2005-11-11 15:30:49.000000000 -0700 @@ -76,10 +76,8 @@ query("INSERT INTO $dbtable[archives] (name, repositoryname, email, url, oai, protocol, description, rst, datestamp) VALUES ('$vars[archive_name]', '$vars[repositoryname]', '$vars[admin_email]', '$vars[archive_url]', '$vars[archive_oai]', '$vars[protocol_version]', '$vars[archive_description]', '$vars[archive_rst]', NOW())"); - + + // get archive's id + $result = $db->query("SELECT id FROM $dbtable[archives] WHERE oai='$vars[archive_oai]'"); + $items = $db->assoc_array($result); + + $sets = split("\n", $vars[archive_sets]); + foreach($sets as $set) { + $set = trim($set); + if(!empty($set) && $set != "") { + $db->query("INSERT INTO $dbtable[archives_sets] (archive,setspec) VALUES ($items[id], '".$set."')"); + } + } + if(!empty($config['add_notify'])) { // send notification email to admin mail($config['contact_email'], $config['short_title'] . " - Archive submitted", "The archive ".sanitize_output($vars[archive_name])." has been submitted to the " . $config['short_title'] . ". Please login at {$baseurl}admin.php to index this archive.", "From: " . $config['contact_email']); @@ -193,6 +200,12 @@
+ OAI archive sets: +
+ OAI sets to archive. One SET per line. Leave blank for all sets
+
+ +
Description:
Brief description of the website and its scholarly resources.
diff -Naur harvester/admin.php harvester.new/admin.php --- harvester/admin.php 2004-02-22 17:50:03.000000000 -0700 +++ harvester.new/admin.php 2005-11-11 15:30:57.000000000 -0700 @@ -61,7 +61,6 @@ exit; } - // check if user is logged in if($_SESSION['logged_in'] !== true) { include("include/header.inc.php"); @@ -257,6 +256,12 @@
+ OAI archive sets: +
+ OAI sets to archive. One SET per line. Leave blank for all sets
+
+ +
Description:
Brief description of the website and its scholarly resources.
@@ -311,7 +316,7 @@ query("INSERT INTO $dbtable[archives] (name, repositoryname, email, url, oai, protocol, description, rst, index_method, datestamp) VALUES ('$vars[archive_name]', '$vars[repositoryname]', '$vars[admin_email]', '$vars[archive_url]', '$vars[archive_oai]', '$vars[protocol_version]', '$vars[archive_description]', '$vars[archive_rst]', '$vars[index_method]', NOW())"); - + // get archive's id $result = $db->query("SELECT id FROM $dbtable[archives] WHERE oai='$vars[archive_oai]'"); $items = $db->assoc_array($result); - + + $sets = split("\n", $vars[archive_sets]); + foreach($sets as $set) { + $set = trim($set); + if(!empty($set) && $set != "") { + $db->query("INSERT INTO $dbtable[archives_sets] (archive,setspec) VALUES ($items[id], '".$set."')"); + } + } ?> The archive has been successfully added.

@@ -399,6 +411,9 @@ if(!empty($confirm)) { // delete the archive $db->query("DELETE FROM $dbtable[archives] WHERE id='$archive'"); + + // delete any archive sets + $db->query("DELETE FROM $dbtable[archives_sets] WHERE archive='$archive'"); // delete any indexed items from this archive $db->query("DELETE FROM $dbtable[metadata] WHERE archive='$archive'"); @@ -432,6 +447,18 @@ $archive = (int) $archive; + // get archive sets + $sets = ""; + $result = $db->query("SELECT setspec FROM $dbtable[archives_sets] WHERE archive='$archive'"); + + while($set = $db->assoc_array($result)) { + $sets .= $set[setspec] . "\n"; + } + + if(!empty($sets)) { + $sets = substr($sets,0,strlen($sets)-1); + } + // get archive info from database $result = $db->query("SELECT * FROM $dbtable[archives] WHERE id='$archive'"); if($db->num_rows($result) == 0) { @@ -478,6 +505,12 @@
+ OAI archive sets: +
+ OAI sets to archive. One SET per line. Leave blank for all sets
+
+ +
OAI protocol version:
@@ -544,11 +577,19 @@ // prepare variables for adding to database $vars[archive_id] = (int) $vars[archive_id]; - list($vars[archive_name], $vars[repositoryname], $vars[admin_email], $vars[archive_url], $vars[archive_oai], $vars[protocol_version], $vars[archive_description], $vars[archive_rst], $vars[index_method]) = sanitize_db_input($vars[archive_name], $vars[repositoryname], $vars[admin_email], $vars[archive_url], $vars[archive_oai], $vars[protocol_version], $vars[archive_description], $vars[archive_rst], $vars[index_method]); + list($vars[archive_name], $vars[repositoryname], $vars[admin_email], $vars[archive_url], $vars[archive_oai], $vars[archive_sets], $vars[protocol_version], $vars[archive_description], $vars[archive_rst], $vars[index_method]) = sanitize_db_input($vars[archive_name], $vars[repositoryname], $vars[admin_email], $vars[archive_url], $vars[archive_oai], $vars[archive_sets], $vars[protocol_version], $vars[archive_description], $vars[archive_rst], $vars[index_method]); $vars[archive_rst] = preg_replace('/[^\w\-]/i', '', $vars[archive_rst]); $db->query("UPDATE $dbtable[archives] SET name='$vars[archive_name]', repositoryname='$vars[repositoryname]', email='$vars[archive_email]', url='$vars[archive_url]', oai='$vars[archive_oai]', protocol='$vars[archive_protocol]', description='$vars[archive_description]', rst='$vars[archive_rst]', index_method='$vars[index_method]' WHERE id='$vars[archive_id]'"); - + $db->query("DELETE FROM $dbtable[archives_sets] WHERE archive=$vars[archive_id]"); + + $sets = split("\n", $vars[archive_sets]); + foreach($sets as $set) { + $set = trim($set); + if(!empty($set) && $set != "") { + $db->query("INSERT INTO $dbtable[archives_sets] (archive,setspec) VALUES ($vars[archive_id], '".$set."')"); + } + } ?> The archive has been successfully updated.

@@ -567,6 +608,18 @@

query("SELECT setspec FROM $dbtable[archives_sets] WHERE archive='$archive'"); + + while($set = $db->assoc_array($result)) { + $sets .= $set[setspec] . "
"; + } + + if(!empty($sets)) { + $sets = substr($sets,0,strlen($sets)-4); + } + // get archive info from database $result = $db->query("SELECT * FROM $dbtable[archives] WHERE id='$archive'"); if($db->num_rows($result) == 0) { @@ -606,6 +659,11 @@
+ OAI archive sets: + +
+ +
OAI protocol version:
diff -Naur harvester/include/db.inc.php harvester.new/include/db.inc.php --- harvester/include/db.inc.php 2004-02-22 17:50:03.000000000 -0700 +++ harvester.new/include/db.inc.php 2005-11-11 15:31:08.000000000 -0700 @@ -36,22 +36,24 @@ $db_config['type'] = "mysql"; // Name of database $db_config['name'] = "harvester"; +// NOTE: database name and credentials are site-specific; set them per deployment and never ship them in a patch // Hostname, port, username, and password to database server $db_config['host'] = "localhost"; $db_config['port'] = ""; $db_config['uname'] = "root"; $db_config['password'] = ""; // Set to 1 to enable persistent connections, 0 to disable -$db_config['pconnect'] = 1; +$db_config['pconnect'] = 0; // database tables $dbtable = array(); $dbtable['harvester_config'] = "harvester_config"; $dbtable['archives'] = "archives"; +$dbtable['archives_sets'] = "archives_sets"; $dbtable['metadata'] = "metadata"; $dbtable['links'] = "links"; $dbtable['oai_resumption_tokens'] = "oai_resumption_tokens"; diff -Naur harvester/include/harvester.inc.php harvester.new/include/harvester.inc.php --- harvester/include/harvester.inc.php 2004-02-22 17:50:03.000000000 -0700 +++ harvester.new/include/harvester.inc.php 2005-11-11 15:31:15.000000000 -0700 @@ -5,19 +5,19 @@ // PKP OAI Harvester // Copyright (c) 2003-2004 The Public Knowledge Project // http://www.pkp.ubc.ca -// +// // This file is part of the PKP OAI Harvester. -// +// // The PKP OAI Harvester is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. -// +// // The PKP OAI Harvester is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. 
-// +// // You should have received a copy of the GNU General Public License // along with the PKP OAI Harvester; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -32,63 +32,65 @@ // index papers from a specific archive, or all archives if $archive = 0 function indexArchives($archive = 0) { global $db, $dbtable; - + // make sure we don't timeout too early set_time_limit(1200); - + // get archive info from database - $sql = "SELECT * FROM $dbtable[archives] "; - if(!empty($archive)) { $sql .= "WHERE id='$archive' "; } + $sql = "SELECT a.*, s.setspec FROM $dbtable[archives] as a \n"; + $sql .= "LEFT OUTER JOIN $dbtable[archives_sets] as s \n"; + $sql .= "ON a.id = s.archive \n"; + if(!empty($archive)) { $sql .= "WHERE a.id='$archive' \n"; } $sql .= "ORDER BY LOWER(name)"; - + $result = $db->query($sql); - + // loop through each archive for($i=0; $i<$db->num_rows($result); $i++) { $archive_items = $db->assoc_array($result, $i); $num_records = 0; - - echo "Indexing $archive_items[name] ... "; - + + echo "Indexing $archive_items[name]"; + echo ((empty($archive_items[setspec])) ? "" : " - $archive_items[setspec]"); + echo " ... 
"; + // determine which protocol version archive uses if(preg_match("/^2/", $archive_items[protocol])) { // OAI 2.0 $protocol = 2; - } else { // OAI 1.1 $protocol = 1; } - + // fetch records via OAI ListIdentifiers or ListRecords functions // loop until a resumption token is not returned $resumptiontoken = ""; do { if($archive_items[index_method] == "I") { - list($oai_records, $resumptiontoken) = parseListIdentifiers($archive_items[oai], $protocol, $resumptiontoken); + list($oai_records, $resumptiontoken) = parseListIdentifiers($archive_items[oai], $protocol, $archive_items[setspec], $resumptiontoken); } else { - list($oai_records, $resumptiontoken) = parseListRecords($archive_items[oai], $protocol, $resumptiontoken); + list($oai_records, $resumptiontoken) = parseListRecords($archive_items[oai], $protocol, $archive_items[setspec], $resumptiontoken); } - - // add records to database + // add records to database for($j=0; $j $v) { $metadata[$k] = sanitize_db_input(utf8_decode($v)); } - + // check if this item has already been indexed $record_result = $db->query("SELECT id, datestamp FROM $dbtable[metadata] WHERE oai_identifier='$metadata[oai_identifier]' AND archive='$archive_items[id]'"); - + if($db->num_rows($record_result) != 0) { // record exists, check if datestamp is newer $record_items = $db->assoc_array($record_result); @@ -133,31 +135,30 @@ } } } while($resumptiontoken != ""); - - + // update datestamp and record count for archive $count_result = $db->query("SELECT COUNT(*) FROM $dbtable[metadata] WHERE archive='$archive_items[id]'"); list($total_records) = $db->assoc_array($count_result); $db->query("UPDATE $dbtable[archives] SET num_records='$total_records', datestamp=NOW() WHERE id='$archive_items[id]'"); - + echo "done ($num_records new/updated record"; if($num_records != 1) { echo "s"; } echo " indexed)

\n"; } - + } // return metadata information from OAI record function formatMetadata(&$record, $protocol) { $metadata = array(); - + // return false if record is not valid if(!isset($record[header][identifier])) { return false; } - + if($protocol == 2) { // OAI 2.0 $dc_metadata = $record[metadata]['oai_dc:dc']; @@ -169,25 +170,24 @@ if(!is_array($dc_metadata)) { return false; } - + // remove leading "dc:" in field names foreach($dc_metadata as $k => $v) { $dc_metadata[str_replace("dc:", "", $k)] = $v; } - + } else { // OAI 1.1 $dc_metadata = $record[metadata][dc]; - + if(!is_array($dc_metadata)) { return false; } } - - + // the oai identifier $metadata[oai_identifier] = $record[header][identifier]; - + // last modification date of record $metadata[datestamp] = $record[header][datestamp]; if(preg_match('/^(\d\d\d\d\-\d\d\-\d\d)T(\d\d:\d\d:\d\d)Z$/', $metadata[datestamp])) { @@ -195,17 +195,17 @@ } else { $metadata[datestamp] = date("Y-m-d", strtotime($metadata[datestamp])); } - + // record identifier if(is_array($dc_metadata[identifier])) { $metadata[identifier] = $dc_metadata[identifier][0]; } else { $metadata[identifier] = $dc_metadata[identifier]; } - + // record title $metadata[title] = is_array($dc_metadata[title]) ? 
$dc_metadata[title][0] : $dc_metadata[title]; - + // record authors if(is_array($dc_metadata[creator])) { // record has multiple authors @@ -213,34 +213,34 @@ $metadata[author] = $author_items[0]; $metadata[affiliation] = $author_items[1]; $metadata[email] = $author_items[2]; - + $metadata[add_authors] = array(); $metadata[add_affiliations] = array(); $metadata[add_emails] = array(); - + for($i=1; $i= 1) { $xml_data[record] = array($xml_data[record]); } - + return array($xml_data[record], $xml_data[resumptiontoken]); } -function parseListIdentifiers($oai_url, $protocol, $resumptiontoken = "") { +function parseListIdentifiers($oai_url, $protocol, $setspec = "", $resumptiontoken = "") { $oai_identifiers = array(); $oai_records = array(); - + if(!empty($resumptiontoken)) { $file = $oai_url."?verb=ListIdentifiers&resumptionToken=".$resumptiontoken; - + } else if($protocol == 2) { // OAI 2.0 $file = $oai_url."?verb=ListIdentifiers&metadataPrefix=oai_dc"; - + } else { // OAI 1.1 $file = $oai_url."?verb=ListIdentifiers"; } - + + // add the set restriction exactly once, URL-encoded, and only on the initial request (a resumptionToken is an exclusive argument) + if (empty($resumptiontoken) && !empty($setspec)) { + $file .= "&set=".urlencode($setspec); + } + $xml_data = parseXML($file); if($protocol == 2) { // OAI 2.0 $xml_data = $xml_data['oai-pmh'][listidentifiers]; - + if(isset($xml_data[header]) && !is_array($xml_data[header])) { array_push($oai_identifiers, $xml_data[header][identifier]); - + } else if(is_array($xml_data[header])) { foreach($xml_data[header] as $identifier) { array_push($oai_identifiers, $identifier[identifier]); } } - + } else { // OAI 1.1 $xml_data = $xml_data[listidentifiers]; $oai_identifiers = array_merge($oai_identifiers, $xml_data[identifier]); } - - + + for($i=0; $i