Commit 9ead2890 authored by tobyink

Add arc2, some rearranging.

parent 8da00eed
This diff is collapsed.
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
class: ARC2 base class
author: Benjamin Nowack
version: 2009-02-16 (Tweak: support for "raw" parameter in toTurtle and toRDFXML methods)
*/
class ARC2_Class {
function __construct($a = '', &$caller) {
$a = is_array($a) ? $a : array();
$this->a = $a;
$this->caller = &$caller;
$this->__init();
}
function ARC2_Class($a = '', &$caller) {
$this->__construct($a, $caller);
}
function __init() {/* base, time_limit */
$this->inc_path = ARC2::getIncPath();
$this->base = $this->v('base', ARC2::getScriptURI(), $this->a);
$this->errors = array();
$this->warnings = array();
$this->adjust_utf8 = $this->v('adjust_utf8', 0, $this->a);
}
/* */
function v($name, $default = false, $o = false) {/* value if set */
$o = ($o !== false) ? $o : $this;
if (is_array($o)) {
return isset($o[$name]) ? $o[$name] : $default;
}
return isset($o->$name) ? $o->$name : $default;
}
function v1($name, $default = false, $o = false) {/* value if 1 (= not empty) */
$o = ($o !== false) ? $o : $this;
if (is_array($o)) {
return (isset($o[$name]) && $o[$name]) ? $o[$name] : $default;
}
return (isset($o->$name) && $o->$name) ? $o->$name : $default;
}
function m($name, $a = false, $default = false, $o = false) {/* call method */
$o = ($o !== false) ? $o : $this;
return method_exists($o, $name) ? $o->$name($a) : $default;
}
/* */
function camelCase($v) {
$r = ucfirst($v);
while (preg_match('/^(.*)[\-\_ ](.*)$/', $r, $m)) {
$r = $m[1] . ucfirst($m[2]);
}
return $r;
}
/* */
function addError($v) {
if (!in_array($v, $this->errors)) {
$this->errors[] = $v;
}
if ($this->caller && method_exists($this->caller, 'addError')) {
$glue = strpos($v, ' in ') ? ' via ' : ' in ';
$this->caller->addError($v . $glue . get_class($this));
}
return false;
}
function getErrors() {
return $this->errors;
}
function getWarnings() {
return $this->warnings;
}
/* */
function splitURI($v) {
return ARC2::splitURI($v);
}
/* */
function expandPName($v) {
if (!isset($this->ns) && isset($this->a['ns'])) $this->ns = $this->a['ns'];
if (preg_match('/^([a-z0-9\_\-]+)\:([a-z0-9\_\-]+)$/i', $v, $m) && isset($this->ns[$m[1]])) {
return $this->ns[$m[1]] . $m[2];
}
return $v;
}
function getPName($v, $connector = ':') {
if (!isset($this->ns) && isset($this->a['ns'])) $this->ns = $this->a['ns'];
if ($parts = $this->splitURI($v)) {
foreach ($this->ns as $p => $ns) {
if ($parts[0] == $ns) {
return $p . $connector . $parts[1];
}
}
}
return $v;
}
/* */
function calcURI($path, $base = "") {
/* quick check */
if (preg_match("/^[a-z0-9\_]+\:/i", $path)) {/* abs path or bnode */
return $path;
}
if (preg_match("/^\/\//", $path)) {/* net path, assume http */
return 'http:' . $path;
}
/* other URIs */
$base = $base ? $base : $this->base;
$base = preg_replace('/\#.*$/', '', $base);
if ($path === true) {/* empty (but valid) URIref via turtle parser: <> */
return $base;
}
$path = preg_replace("/^\.\//", '', $path);
$root = preg_match('/(^[a-z0-9]+\:[\/]{1,2}[^\/]+)[\/|$]/i', $base, $m) ? $m[1] : $base; /* w/o trailing slash */
$base .= ($base == $root) ? '/' : '';
if (preg_match('/^\//', $path)) {/* leading slash */
return $root . $path;
}
if (!$path) {
return $base;
}
if (preg_match('/^([\#\?])/', $path, $m)) {
return preg_replace('/\\' .$m[1]. '.*$/', '', $base) . $path;
}
if (preg_match('/^(\&)(.*)$/', $path, $m)) {/* not perfect yet */
return preg_match('/\?/', $base) ? $base . $m[1] . $m[2] : $base . '?' . $m[2];
}
if (preg_match("/^[a-z0-9]+\:/i", $path)) {/* abs path */
return $path;
}
/* rel path: remove stuff after last slash */
$base = substr($base, 0, strrpos($base, '/')+1);
/* resolve ../ */
while (preg_match('/^(\.\.\/)(.*)$/', $path, $m)) {
$path = $m[2];
$base = ($base == $root.'/') ? $base : preg_replace('/^(.*\/)[^\/]+\/$/', '\\1', $base);
}
return $base . $path;
}
/* */
function calcBase($path) {
$r = $path;
$r = preg_replace('/\#.*$/', '', $r);/* remove hash */
$r = preg_replace('/^\/\//', 'http://', $r);/* net path (//), assume http */
if (preg_match('/^[a-z0-9]+\:/', $r)) {/* scheme, abs path */
while (preg_match('/^(.+\/)(\.\.\/.*)$/U', $r, $m)) {
$r = $this->calcURI($m[1], $m[2]);
}
return $r;
}
return 'file://' . realpath($r);/* real path */
}
/* */
function toNTriples($v, $ns = '', $raw = 0) {
ARC2::inc('NTriplesSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_NTriplesSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return (isset($v[0]) && isset($v[0]['s'])) ? $ser->getSerializedTriples($v, $raw) : $ser->getSerializedIndex($v, $raw);
}
function toTurtle($v, $ns = '', $raw = 0) {
ARC2::inc('TurtleSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_TurtleSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return (isset($v[0]) && isset($v[0]['s'])) ? $ser->getSerializedTriples($v, $raw) : $ser->getSerializedIndex($v, $raw);
}
function toRDFXML($v, $ns = '', $raw = 0) {
ARC2::inc('RDFXMLSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_RDFXMLSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return (isset($v[0]) && isset($v[0]['s'])) ? $ser->getSerializedTriples($v, $raw) : $ser->getSerializedIndex($v, $raw);
}
function toRDFJSON($v, $ns = '') {
ARC2::inc('RDFJSONSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_RDFJSONSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return (isset($v[0]) && isset($v[0]['s'])) ? $ser->getSerializedTriples($v) : $ser->getSerializedIndex($v);
}
function toLegacyXML($v, $ns = '') {
ARC2::inc('LegacyXMLSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_LegacyXMLSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return $ser->getSerializedArray($v);
}
function toLegacyJSON($v, $ns = '') {
ARC2::inc('LegacyJSONSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_LegacyJSONSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return $ser->getSerializedArray($v);
}
function toLegacyHTML($v, $ns = '') {
ARC2::inc('LegacyHTMLSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_LegacyHTMLSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return $ser->getSerializedArray($v);
}
function toHTML($v, $ns = '') {
ARC2::inc('POSHRDFSerializer');
if (!$ns) $ns = isset($this->a['ns']) ? $this->a['ns'] : array();
$ser = new ARC2_POSHRDFSerializer(array_merge($this->a, array('ns' => $ns)), $this);
return (isset($v[0]) && isset($v[0]['s'])) ? $ser->getSerializedTriples($v) : $ser->getSerializedIndex($v);
}
/* */
function getFilledTemplate($t, $vals, $g = '') {
$parser = ARC2::getTurtleParser();
$parser->parse($g, $this->getTurtleHead() . $t);
return $parser->getSimpleIndex(0, $vals);
}
function getTurtleHead() {
$r = '';
$ns = $this->v('ns', array(), $this->a);
foreach ($ns as $k => $v) {
$r .= "@prefix " . $k . ": <" .$v. "> .\n";
}
return $r;
}
/* */
function toUTF8($v) {
return $this->adjust_utf8 ? ARC2::toUTF8($v) : $v;
}
/* */
}
This diff is collapsed.
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
function: format detection
author: Benjamin Nowack
version: 2009-02-09 (Tweak: basic multi-line ntriples/turtle is now supported)
*/
function ARC2_getFormat($v, $mtype = '', $ext = '') {
$r = false;
/* mtype check (atom, rdf/xml, turtle, n3, mp3, jpg) */
$r = (!$r && preg_match('/\/atom\+xml/', $mtype)) ? 'atom' : $r;
$r = (!$r && preg_match('/\/rdf\+xml/', $mtype)) ? 'rdfxml' : $r;
$r = (!$r && preg_match('/\/(x\-)?turtle/', $mtype)) ? 'turtle' : $r;
$r = (!$r && preg_match('/\/rdf\+n3/', $mtype)) ? 'n3' : $r;
/* xml sniffing */
if (!$r && preg_match('/^\s*\<[^\s]/s', $v) && (preg_match('/\<\/[a-z0-9\_\:\-]+\>/i', $v) || preg_match('/\sxmlns\:?/', $v))) {
while (preg_match('/^\s*\<\?xml[^\r\n]+\?\>\s*/s', $v)) {
$v = preg_replace('/^\s*\<\?xml[^\r\n]+\?\>\s*/s', '', $v);
}
while (preg_match('/^\s*\<\!--.+?--\>\s*/s', $v)) {
$v = preg_replace('/^\s*\<\!--.+?--\>\s*/s', '', $v);
}
/* doctype checks (html, rdf) */
$r = (!$r && preg_match('/^\s*\<\!DOCTYPE\s+html[\s|\>]/is', $v)) ? 'html' : $r;
$r = (!$r && preg_match('/^\s*\<\!DOCTYPE\s+[a-z0-9\_\-]\:RDF\s/is', $v)) ? 'rdfxml' : $r;
/* markup checks */
$v = preg_replace('/^\s*\<\!DOCTYPE\s.*\]\>/is', '', $v);
$r = (!$r && preg_match('/^\s*\<rss\s+[^\>]*version/s', $v)) ? 'rss' : $r;
$r = (!$r && preg_match('/^\s*\<feed\s+[^\>]+http\:\/\/www\.w3\.org\/2005\/Atom/s', $v)) ? 'atom' : $r;
$r = (!$r && preg_match('/^\s*\<opml\s/s', $v)) ? 'opml' : $r;
$r = (!$r && preg_match('/^\s*\<html[\s|\>]/is', $v)) ? 'html' : $r;
$r = (!$r && preg_match('/^\s*\<sparql\s+[^\>]+http\:\/\/www\.w3\.org\/2005\/sparql\-results\#/s', $v)) ? 'sparqlxml' : $r;
$r = (!$r && preg_match('/^\s*\<[^\>]+http\:\/\/www\.w3\.org\/2005\/sparql\-results#/s', $v)) ? 'srx' : $r;
$r = (!$r && preg_match('/^\s*\<[^\s]*RDF[\s\>]/s', $v)) ? 'rdfxml' : $r;
$r = (!$r && preg_match('/^\s*\<[^\>]+http\:\/\/www\.w3\.org\/1999\/02\/22\-rdf/s', $v)) ? 'rdfxml' : $r;
$r = !$r ? 'xml' : $r;
}
/* json|jsonp */
if (!$r && preg_match('/^[a-z0-9\.\(]*\s*[\{\[].*/s', trim($v))) {
/* google social graph api */
$r = (!$r && preg_match('/\"canonical_mapping\"/', $v)) ? 'sgajson' : $r;
/* crunchbase api */
$r = (!$r && preg_match('/\"permalink\"/', $v)) ? 'cbjson' : $r;
$r = !$r ? 'json' : $r;
}
/* turtle/n3 */
$r = (!$r && preg_match('/\@(prefix|base)/i', $v)) ? 'turtle' : $r;
$r = (!$r && preg_match('/^(ttl)$/', $ext)) ? 'turtle' : $r;
$r = (!$r && preg_match('/^(n3)$/', $ext)) ? 'n3' : $r;
/* ntriples */
$r = (!$r && preg_match('/^\s*(_:|<).+?\s+<[^>]+?>\s+\S.+?\s*\.\s*$/sm', $v)) ? 'ntriples' : $r;
$r = (!$r && preg_match('/^(nt)$/', $ext)) ? 'ntriples' : $r;
return $r;
}
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
function: result format detection
author: Benjamin Nowack
version: 2008-08-04
*/
function ARC2_getPreferredFormat($default = 'plain') {
$formats = array(
'html' => 'HTML', 'text/html' => 'HTML', 'xhtml+xml' => 'HTML',
'rdfxml' => 'RDFXML', 'rdf+xml' => 'RDFXML',
'ntriples' => 'NTriples', 'rdf+n3' => 'Turtle', 'x-turtle' => 'Turtle', 'turtle' => 'Turtle',
'rdfjson' => 'RDFJSON', 'json' => 'RDFJSON',
'xml' => 'XML',
'legacyjson' => 'LegacyJSON'
);
$prefs = array();
$o_vals = array();
/* accept header */
if ($vals = explode(',', $_SERVER['HTTP_ACCEPT'])) {
foreach ($vals as $val) {
if (preg_match('/(rdf\+n3|x\-turtle|rdf\+xml|text\/html|xhtml\+xml|xml|json)/', $val, $m)) {
$o_vals[$m[1]] = 1;
if (preg_match('/\;q\=([0-9\.]+)/', $val, $sub_m)) {
$o_vals[$m[1]] = 1 * $sub_m[1];
}
}
}
}
/* arg */
if (isset($_GET['format'])) $o_vals[$_GET['format']] = 1.1;
/* rank */
arsort($o_vals);
foreach ($o_vals as $val => $prio) {
$prefs[] = $val;
}
/* default */
$prefs[] = $default;
foreach ($prefs as $pref) {
if (isset($formats[$pref])) {
return $formats[$pref];
}
}
}
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
class: ARC2 DC Extractor
author: Benjamin Nowack
version: 2008-04-09 (Fix: base URL (not doc URL) was used for annotations)
*/
ARC2::inc('RDFExtractor');
class ARC2_DcExtractor extends ARC2_RDFExtractor {
function __construct($a = '', &$caller) {
parent::__construct($a, $caller);
}
function ARC2_DcExtractor($a = '', &$caller) {
$this->__construct($a, $caller);
}
function __init() {
parent::__init();
$this->a['ns']['dc'] = 'http://purl.org/dc/elements/1.1/';
}
/* */
function extractRDF() {
$t_vals = array();
$t = '';
foreach ($this->nodes as $n) {
foreach (array('title', 'link', 'meta') as $tag) {
if ($n['tag'] == $tag) {
$m = 'extract' . ucfirst($tag);
list ($t_vals, $t) = $this->$m($n, $t_vals, $t);
}
}
}
if ($t) {
$doc = $this->getFilledTemplate($t, $t_vals, $n['doc_base']);
$this->addTs(ARC2::getTriplesFromIndex($doc));
}
}
/* */
function extractTitle($n, $t_vals, $t) {
if ($t_vals['title'] = $this->getPlainContent($n)) {
$t .= '<' . $n['doc_url'] . '> dc:title ?title . ';
}
return array($t_vals, $t);
}
/* */
function extractLink($n, $t_vals, $t) {
if ($this->hasRel($n, 'alternate') || $this->hasRel($n, 'meta')) {
if ($href = $this->v('href uri', '', $n['a'])) {
$t .= '<' . $n['doc_url'] . '> rdfs:seeAlso <' . $href . '> . ';
if ($v = $this->v('type', '', $n['a'])) {
$t .= '<' .$href. '> dc:format "' . $v . '" . ';
}
if ($v = $this->v('title', '', $n['a'])) {
$t .= '<' .$href. '> dc:title "' . $v . '" . ';
}
}
}
return array($t_vals, $t);
}
function extractMeta($n, $t_vals, $t) {
if ($this->hasAttribute('http-equiv', $n, 'Content-Type') || $this->hasAttribute('http-equiv', $n, 'content-type')) {
if ($v = $this->v('content', '', $n['a'])) {
$t .= '<' . $n['doc_url'] . '> dc:format "' . $v . '" . ';
}
}
return array($t_vals, $t);
}
/* */
}
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
class: ARC2 eRDF Extractor (w/o link title generation)
author: Benjamin Nowack
version: 2009-02-09 (Tweak: getRootNode returns 1st node if html tag is not found)
*/
ARC2::inc('RDFExtractor');
class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
/* Constructor: passes the config array and the calling object straight on to the parent constructor. */
function __construct($a = '', &$caller) {
parent::__construct($a, $caller);
}
/* Method named after the class; it only forwards its arguments to __construct(). */
function ARC2_ErdfExtractor($a = '', &$caller) {
$this->__construct($a, $caller);
}
/* Initialisation hook: runs the parent's __init() and sets up nothing of its own. */
function __init() {
parent::__init();
}
/* */
/*
 * Starts the extraction at the root node, but only when the caller's
 * detected_formats list has an "erdf" entry (returns 0 otherwise).
 * The document base serves as the initial base, previous and current resource.
 */
function extractRDF() {
  $is_erdf = isset($this->caller->detected_formats['erdf']);
  if (!$is_erdf) {
    return 0;
  }
  $start_node = $this->getRootNode();
  $doc_base = $this->getDocBase();
  $prefixes = $this->getNamespaces();
  $this->processNode($start_node, array(
    'base' => $doc_base,
    'prev_res' => $doc_base,
    'cur_res' => $doc_base,
    'ns' => $prefixes,
    'lang' => '',
  ));
}
/* */
/* Returns the first node whose tag is "html"; if there is none, the node stored at index 0. */
function getRootNode() {
  foreach ($this->nodes as $candidate) {
    if ($candidate['tag'] != 'html') {
      continue;
    }
    return $candidate;
  }
  return $this->nodes[0];
}
/*
 * Builds the prefix => namespace URI map: rdf and rdfs are predefined, and
 * every link/a node whose rel contains "schema.PREFIX" and that has an
 * "href uri" attribute adds (or overrides) PREFIX.
 */
function getNamespaces() {
  $prefixes = array(
    'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#'
  );
  foreach ($this->nodes as $node) {
    if (!preg_match('/^(link|a)$/', $node['tag'])) {
      continue;
    }
    if (!isset($node['a']['rel'])) {
      continue;
    }
    if (!preg_match('/schema\.([^\s]+)/is', $node['a']['rel'], $found)) {
      continue;
    }
    if (!isset($node['a']['href uri'])) {
      continue;
    }
    $prefixes[$found[1]] = $node['a']['href uri'];
  }
  return $prefixes;
}
/* */
function processNode($n, $ct) {
/* context */
//$ct['lang'] = $this->v('xml:lang', $ct['lang'], $n['a']);
$ct['lang'] = '';
$ct['prop_uris'] = $this->getPropertyURIs($n, $ct);
$ct['prev_res'] = $ct['cur_res'];
$ct['cur_res'] = $this->getCurrentResourceURI($n, $ct);
$ct['cur_obj_id'] = $this->getCurrentObjectID($n, $ct);
$ct['cur_obj_literal'] = $this->getCurrentObjectLiteral($n, $ct);
/* triple production (http://research.talis.com/2005/erdf/wiki/Main/SummaryOfTripleProductionRules) */
foreach ($ct['prop_uris'] as $type => $uris) {
foreach ($uris as $uri) {
$rdf_type = preg_match('/^ /', $uri) ? 1 : 0;
/* meta + name */
if (($type == 'name') && ($n['tag'] == 'meta')) {
$t = array(
's' => $ct['cur_res'],
's_type' => 'uri',
'p' => $uri,
'o' => $ct['cur_obj_literal']['value'],
'o_type' => 'literal',
'o_lang' => $ct['cur_obj_literal']['datatype'] ? '' : $ct['cur_obj_literal']['lang'],
'o_datatype' => $ct['cur_obj_literal']['datatype'],
);
$this->addT($t);
}
/* class */
if ($type == 'class') {
if ($rdf_type) {
$s = $this->v('href uri', $ct['cur_res'], $n['a']);
$s = $this->v('src uri', $s, $n['a']);
$t = array(
's' => $s,
's_type' => 'uri',
'p' => $ct['ns']['rdf'] . 'type',
'o' => trim($uri),
'o_type' => 'uri',
'o_lang' => '',
'o_datatype' => '',
);
}
elseif (isset($n['a']['id'])) {/* used as object */
$t = array(
's' => $ct['prev_res'],
's_type' => 'uri',
'p' => $uri,
'o' => $ct['cur_res'],
'o_type' => 'uri',
'o_lang' => '',
'o_datatype' => '',
);
}
else {
$t = array(
's' => $ct['cur_res'],
's_type' => 'uri',
'p' => $uri,
'o' => $ct['cur_obj_literal']['value'],
'o_type' => 'literal',
'o_lang' => $ct['cur_obj_literal']['datatype'] ? '' : $ct['cur_obj_literal']['lang'],
'o_datatype' => $ct['cur_obj_literal']['datatype'],
);
if (($o = $this->v('src uri', '', $n['a'])) || ($o = $this->v('href uri', '', $n['a']))) {
if (!$ct['prop_uris']['rel'] && !$ct['prop_uris']['rev']) {
$t['o'] = $o;
$t['o_type'] = 'uri';
$t['o_lang'] = '';
$t['o_datatype'] = '';
}
}
}
$this->addT($t);
}
/* rel */
if ($type == 'rel') {
if (($o = $this->v('src uri', '', $n['a'])) || ($o = $this->v('href uri', '', $n['a']))) {
$t = array(
's' => $ct['cur_res'],
's_type' => 'uri',
'p' => $uri,
'o' => $o,
'o_type' => 'uri',
'o_lang' => '',
'o_datatype' => '',
);
$this->addT($t);
}
}
/* rev */
if ($type == 'rev') {
if (($s = $this->v('src uri', '', $n['a'])) || ($s = $this->v('href uri', '', $n['a']))) {
$t = array(
's' => $s,
's_type' => 'uri',
'p' => $uri,
'o' => $ct['cur_res'],
'o_type' => 'uri',
'o_lang' => '',
'o_datatype' => '',
);
$this->addT($t);