We are no longer offering accounts on this server. Consider https://gitlab.freedesktop.org/ as a place to host projects.

Commit 071e621d authored by Mike Sheldon's avatar Mike Sheldon

Update ARC2 (Not massively with this being kept in tree though...)

parent 1f385475
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
<?php
/**
* ARC2 Resource object
*
* @author Benjamin Nowack <bnowack@semsol.com>
* @license http://arc.semsol.org/license
* @homepage <http://arc.semsol.org/>
* @package ARC2
* @version 2010-11-16
*/
ARC2::inc('Class');
/**
 * Convenience wrapper around a property index for one (default) resource URI.
 *
 * The index is a nested array keyed by subject, then by predicate, holding
 * lists of object arrays (each with at least a 'value' entry). Data that is
 * not in the index yet is loaded on demand, either from an attached store or
 * via the inherited toIndex() helper.
 */
class ARC2_Resource extends ARC2_Class {

  /**
   * Hands both arguments straight to the ARC2_Class constructor.
   *
   * @param mixed $a      Passed through unchanged to the parent.
   * @param mixed $caller Passed through (by reference) to the parent.
   */
  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }

  /**
   * Runs the parent initialisation, then resets this object's own state:
   * no URI, empty index, nothing fetched, no store.
   */
  function __init() {
    parent::__init();
    $this->store = '';
    $this->fetched = array();
    $this->index = array();
    $this->uri = '';
  }

  /* setters */

  /** Sets the default subject URI used whenever a method gets no $s. */
  function setURI($uri) {
    $this->uri = $uri;
  }

  /** Replaces the complete index. */
  function setIndex($index) {
    $this->index = $index;
  }

  /**
   * Replaces all properties of one subject.
   *
   * @param array  $props Predicate => object list.
   * @param string $s     Subject; falls back to the default URI when empty.
   */
  function setProps($props, $s = '') {
    $subject = $s ? $s : $this->uri;
    $this->index[$subject] = $props;
  }

  /**
   * Sets the object list of one property.
   *
   * $os may be a plain value, one object array (recognised by its 'value'
   * key), or a list mixing both; plain values become literal objects.
   *
   * @param string $p  Property name, run through expandPName().
   * @param mixed  $os Value(s) to store.
   * @param string $s  Subject; falls back to the default URI when empty.
   */
  function setProp($p, $os, $s = '') {
    $subject = $s ? $s : $this->uri;
    /* a single plain value becomes a literal object */
    if (!is_array($os)) {
      $os = array('value' => $os, 'type' => 'literal');
    }
    /* a single object array becomes a one-element list */
    if (isset($os['value'])) {
      $os = array($os);
    }
    /* plain values inside the list become literal objects, keys are kept */
    $objects = array();
    foreach ($os as $key => $entry) {
      $objects[$key] = is_array($entry) ? $entry : array('value' => $entry, 'type' => 'literal');
    }
    $predicate = $this->expandPName($p);
    $this->index[$subject][$predicate] = $objects;
  }

  /** Attaches the store that fetchData() queries (any falsy value detaches it). */
  function setStore($store) {
    $this->store = $store;
  }

  /* loading */

  /**
   * Loads the description of a URI into the index, once per URI.
   *
   * With a store attached, a CONSTRUCT query for all triples having the URI
   * as subject is run; otherwise the inherited toIndex() is called with the
   * URI. The result is merged into the index via ARC2::getMergedIndex().
   *
   * @param string $uri URI to load; falls back to the default URI when empty.
   * @return int|null 0 if there is no URI or it was fetched before,
   *                  otherwise nothing.
   */
  function fetchData($uri = '') {
    if (!$uri) {
      $uri = $this->uri;
    }
    if (!$uri || in_array($uri, $this->fetched)) {
      return 0;
    }
    /* guarantees an entry for the URI even if nothing is found */
    $this->index[$uri] = array();
    $node = '<' . $uri . '>';
    $index = $this->store
      ? $this->store->query('CONSTRUCT { ' . $node . ' ?p ?o . } WHERE { ' . $node . ' ?p ?o . } ', 'raw')
      : $this->toIndex($uri);
    $this->index = ARC2::getMergedIndex($this->index, $index);
    $this->fetched[] = $uri;
  }

  /* getters */

  /**
   * Returns the objects of one property, or every property of a subject.
   *
   * Subjects missing from the index are fetched first.
   *
   * @param string $p Property name (expanded via expandPName()); when empty,
   *                  the subject's whole predicate => objects array is returned.
   * @param string $s Subject; falls back to the default URI when empty.
   * @return array Empty array when no subject is available. For a named
   *               property the lookup goes through the inherited v() helper
   *               with an empty array as fallback argument.
   */
  function getProps($p = '', $s = '') {
    if (!$s) {
      $s = $this->uri;
    }
    if (!$s) {
      return array();
    }
    if (!isset($this->index[$s])) {
      $this->fetchData($s);
    }
    return $p
      ? $this->v($this->expandPName($p), array(), $this->index[$s])
      : $this->index[$s];
  }

  /**
   * Returns the first object of a property.
   *
   * @return array|string The first object, or '' when there is none.
   */
  function getProp($p, $s = '') {
    $objects = $this->getProps($p, $s);
    if ($objects) {
      return $objects[0];
    }
    return '';
  }

  /**
   * Returns the 'value' of the first object of a property.
   *
   * @return mixed The value, or '' when the property has no object.
   */
  function getPropValue($p, $s = '') {
    $object = $this->getProp($p, $s);
    if ($object) {
      return $object['value'];
    }
    return '';
  }

  /**
   * Returns the 'value' entries of all objects of a property.
   *
   * @return array List of values in index order.
   */
  function getPropValues($p, $s = '') {
    $values = array();
    foreach ($this->getProps($p, $s) as $object) {
      $values[] = $object['value'];
    }
    return $values;
  }

  /**
   * Tells whether a property has an object with the given value.
   *
   * $o is run through expandPName() before the (loose) comparison.
   *
   * @return int 1 on a match, 0 otherwise.
   */
  function hasPropValue($p, $o, $s = '') {
    $objects = $this->getProps($p, $s);
    $wanted = $this->expandPName($o);
    foreach ($objects as $object) {
      if ($object['value'] == $wanted) {
        return 1;
      }
    }
    return 0;
  }

}
This diff is collapsed.
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
function: format detection
author: Benjamin Nowack
version: 2009-02-09 (Tweak: basic multi-line ntriples/turtle is now supported)
*/
/**
 * Guesses the format of a document.
 *
 * Checks are tried in this order and the first hit wins:
 *   1. media type (atom, rdf/xml, turtle, n3)
 *   2. XML sniffing (html, rdfxml, rss, atom, opml, sparqlxml, srx, xml)
 *   3. JSON / JSONP (sgajson, cbjson, json)
 *   4. Turtle/N3 directives or file extension (turtle, n3)
 *   5. N-Triples statement pattern or file extension (ntriples)
 *
 * @param string $v     Document content.
 * @param string $mtype Media type, if known.
 * @param string $ext   File extension without the dot, if known.
 * @return string|false Format keyword, or false when nothing matched.
 */
function ARC2_getFormat($v, $mtype = '', $ext = '') {
  $r = false;
  /* mtype check (atom, rdf/xml, turtle, n3) */
  $r = (!$r && preg_match('/\/atom\+xml/', $mtype)) ? 'atom' : $r;
  $r = (!$r && preg_match('/\/rdf\+xml/', $mtype)) ? 'rdfxml' : $r;
  $r = (!$r && preg_match('/\/(x\-)?turtle/', $mtype)) ? 'turtle' : $r;
  $r = (!$r && preg_match('/\/rdf\+n3/', $mtype)) ? 'n3' : $r;
  /* xml sniffing: starts with a tag and has a closing tag or an xmlns declaration */
  if (!$r && preg_match('/^\s*\<[^\s]/s', $v) && (preg_match('/\<\/[a-z0-9\_\:\-]+\>/i', $v) || preg_match('/\sxmlns\:?/', $v))) {
    /* strip leading xml declarations */
    while (preg_match('/^\s*\<\?xml[^\r\n]+\?\>\s*/s', $v)) {
      $v = preg_replace('/^\s*\<\?xml[^\r\n]+\?\>\s*/s', '', $v);
    }
    /* strip leading comments */
    while (preg_match('/^\s*\<\!--.+?--\>\s*/s', $v)) {
      $v = preg_replace('/^\s*\<\!--.+?--\>\s*/s', '', $v);
    }
    /* doctype checks (html, rdf) */
    $r = (!$r && preg_match('/^\s*\<\!DOCTYPE\s+html[\s|\>]/is', $v)) ? 'html' : $r;
    /* the prefix may have any length ("rdf:RDF"), hence the "+" quantifier */
    $r = (!$r && preg_match('/^\s*\<\!DOCTYPE\s+[a-z0-9\_\-]+\:RDF\s/is', $v)) ? 'rdfxml' : $r;
    /* markup checks (a doctype with internal subset is removed first) */
    $v = preg_replace('/^\s*\<\!DOCTYPE\s.*\]\>/is', '', $v);
    $r = (!$r && preg_match('/^\s*\<rss\s+[^\>]*version/s', $v)) ? 'rss' : $r;
    $r = (!$r && preg_match('/^\s*\<feed\s+[^\>]+http\:\/\/www\.w3\.org\/2005\/Atom/s', $v)) ? 'atom' : $r;
    $r = (!$r && preg_match('/^\s*\<opml\s/s', $v)) ? 'opml' : $r;
    $r = (!$r && preg_match('/^\s*\<html[\s|\>]/is', $v)) ? 'html' : $r;
    $r = (!$r && preg_match('/^\s*\<sparql\s+[^\>]+http\:\/\/www\.w3\.org\/2005\/sparql\-results\#/s', $v)) ? 'sparqlxml' : $r;
    $r = (!$r && preg_match('/^\s*\<[^\>]+http\:\/\/www\.w3\.org\/2005\/sparql\-results#/s', $v)) ? 'srx' : $r;
    $r = (!$r && preg_match('/^\s*\<[^\s]*RDF[\s\>]/s', $v)) ? 'rdfxml' : $r;
    $r = (!$r && preg_match('/^\s*\<[^\>]+http\:\/\/www\.w3\.org\/1999\/02\/22\-rdf/s', $v)) ? 'rdfxml' : $r;
    /* anything else that looked like markup is generic xml */
    $r = !$r ? 'xml' : $r;
  }
  /* json|jsonp */
  if (!$r && preg_match('/^[a-z0-9\.\(]*\s*[\{\[].*/s', trim($v))) {
    /* google social graph api */
    $r = (!$r && preg_match('/\"canonical_mapping\"/', $v)) ? 'sgajson' : $r;
    /* crunchbase api */
    $r = (!$r && preg_match('/\"permalink\"/', $v)) ? 'cbjson' : $r;
    $r = !$r ? 'json' : $r;
  }
  /* turtle/n3 */
  $r = (!$r && preg_match('/\@(prefix|base)/i', $v)) ? 'turtle' : $r;
  $r = (!$r && preg_match('/^(ttl)$/', $ext)) ? 'turtle' : $r;
  $r = (!$r && preg_match('/^(n3)$/', $ext)) ? 'n3' : $r;
  /* ntriples */
  $r = (!$r && preg_match('/^\s*(_:|<).+?\s+<[^>]+?>\s+\S.+?\s*\.\s*$/sm', $v)) ? 'ntriples' : $r;
  $r = (!$r && preg_match('/^(nt)$/', $ext)) ? 'ntriples' : $r;
  return $r;
}
<?php
/**
* ARC2 format detection function
*
* @author Benjamin Nowack <bnowack@semsol.com>
* @license http://arc.semsol.org/license
* @package ARC2
* @version 2010-11-16
*/
/**
 * Guesses the format of a document.
 *
 * Checks are tried in this order and the first hit wins:
 *   1. media type (atom, rdf/xml, turtle, n3, sparql results)
 *   2. XML sniffing (html, rdfxml, rss, atom, opml, sparqlxml, srx, xml)
 *   3. JSON / JSONP (sgajson, cbjson, json)
 *   4. Turtle/N3 directives or file extension (turtle, n3)
 *   5. N-Triples statement pattern or file extension (ntriples)
 *
 * @param string $v     Document content.
 * @param string $mtype Media type, if known.
 * @param string $ext   File extension without the dot, if known.
 * @return string|false Format keyword, or false when nothing matched.
 */
function ARC2_getFormat($v, $mtype = '', $ext = '') {
  $r = false;
  /* mtype check (atom, rdf/xml, turtle, n3, sparql results) */
  $r = (!$r && preg_match('/\/atom\+xml/', $mtype)) ? 'atom' : $r;
  $r = (!$r && preg_match('/\/rdf\+xml/', $mtype)) ? 'rdfxml' : $r;
  $r = (!$r && preg_match('/\/(x\-)?turtle/', $mtype)) ? 'turtle' : $r;
  $r = (!$r && preg_match('/\/rdf\+n3/', $mtype)) ? 'n3' : $r;
  $r = (!$r && preg_match('/\/sparql-results\+xml/', $mtype)) ? 'sparqlxml' : $r;
  /* xml sniffing */
  if (
    !$r &&
    /* starts with angle brackets */
    preg_match('/^\s*\<[^\s]/s', $v) &&
    /* has an xmlns:* declaration or a matching pair of tags */
    (preg_match('/\sxmlns\:?/', $v) || preg_match('/\<([^\s]+).+\<\/\\1\>/s', $v)) &&
    /* not a typical ntriples/turtle/n3 file */
    !preg_match('/[\>\"\']\s*\.\s*$/s', $v)
  ) {
    /* strip leading xml declarations */
    while (preg_match('/^\s*\<\?xml[^\r\n]+\?\>\s*/s', $v)) {
      $v = preg_replace('/^\s*\<\?xml[^\r\n]+\?\>\s*/s', '', $v);
    }
    /* strip leading comments */
    while (preg_match('/^\s*\<\!--.+?--\>\s*/s', $v)) {
      $v = preg_replace('/^\s*\<\!--.+?--\>\s*/s', '', $v);
    }
    /* doctype checks (html, rdf) */
    $r = (!$r && preg_match('/^\s*\<\!DOCTYPE\s+html[\s|\>]/is', $v)) ? 'html' : $r;
    /* the prefix may have any length ("rdf:RDF"), hence the "+" quantifier */
    $r = (!$r && preg_match('/^\s*\<\!DOCTYPE\s+[a-z0-9\_\-]+\:RDF\s/is', $v)) ? 'rdfxml' : $r;
    /* markup checks (a doctype with internal subset is removed first) */
    $v = preg_replace('/^\s*\<\!DOCTYPE\s.*\]\>/is', '', $v);
    $r = (!$r && preg_match('/^\s*\<rss\s+[^\>]*version/s', $v)) ? 'rss' : $r;
    $r = (!$r && preg_match('/^\s*\<feed\s+[^\>]+http\:\/\/www\.w3\.org\/2005\/Atom/s', $v)) ? 'atom' : $r;
    $r = (!$r && preg_match('/^\s*\<opml\s/s', $v)) ? 'opml' : $r;
    $r = (!$r && preg_match('/^\s*\<html[\s|\>]/is', $v)) ? 'html' : $r;
    $r = (!$r && preg_match('/^\s*\<sparql\s+[^\>]+http\:\/\/www\.w3\.org\/2005\/sparql\-results\#/s', $v)) ? 'sparqlxml' : $r;
    $r = (!$r && preg_match('/^\s*\<[^\>]+http\:\/\/www\.w3\.org\/2005\/sparql\-results#/s', $v)) ? 'srx' : $r;
    $r = (!$r && preg_match('/^\s*\<[^\s]*RDF[\s\>]/s', $v)) ? 'rdfxml' : $r;
    $r = (!$r && preg_match('/^\s*\<[^\>]+http\:\/\/www\.w3\.org\/1999\/02\/22\-rdf/s', $v)) ? 'rdfxml' : $r;
    /* anything else that looked like markup is generic xml */
    $r = !$r ? 'xml' : $r;
  }
  /* json|jsonp */
  if (!$r && preg_match('/^[a-z0-9\.\(]*\s*[\{\[].*/s', trim($v))) {
    /* google social graph api */
    $r = (!$r && preg_match('/\"canonical_mapping\"/', $v)) ? 'sgajson' : $r;
    /* crunchbase api */
    $r = (!$r && preg_match('/\"permalink\"/', $v)) ? 'cbjson' : $r;
    $r = !$r ? 'json' : $r;
  }
  /* turtle/n3 */
  $r = (!$r && preg_match('/\@(prefix|base)/i', $v)) ? 'turtle' : $r;
  $r = (!$r && preg_match('/^(ttl)$/', $ext)) ? 'turtle' : $r;
  $r = (!$r && preg_match('/^(n3)$/', $ext)) ? 'n3' : $r;
  /* ntriples */
  $r = (!$r && preg_match('/^\s*(_:|<).+?\s+<[^>]+?>\s+\S.+?\s*\.\s*$/sm', $v)) ? 'ntriples' : $r;
  $r = (!$r && preg_match('/^(nt)$/', $ext)) ? 'ntriples' : $r;
  return $r;
}
<?php
/*
homepage: http://arc.semsol.org/
license: http://arc.semsol.org/license
function: result format detection
author: Benjamin Nowack
version: 2008-08-04
*/
/**
 * Picks the result format preferred by the current request.
 *
 * Candidates come from the HTTP Accept header (ranked by their q value,
 * 1 when absent) and from the "format" GET argument, which outranks every
 * header entry. $default is appended as last candidate. The first candidate
 * that is a known format key decides the result.
 *
 * @param string $default Format key to fall back to.
 * @return string|null Format name (e.g. 'HTML', 'RDFXML', 'Turtle'), or null
 *                     when no candidate, including $default, is a known key.
 */
function ARC2_getPreferredFormat($default = 'plain') {
  $formats = array(
    'html' => 'HTML', 'text/html' => 'HTML', 'xhtml+xml' => 'HTML',
    'rdfxml' => 'RDFXML', 'rdf+xml' => 'RDFXML',
    'ntriples' => 'NTriples', 'rdf+n3' => 'Turtle', 'x-turtle' => 'Turtle', 'turtle' => 'Turtle',
    'rdfjson' => 'RDFJSON', 'json' => 'RDFJSON',
    'xml' => 'XML',
    'legacyjson' => 'LegacyJSON'
  );
  $prefs = array();
  $o_vals = array();
  /* accept header (not sent by every client, and absent on the command line) */
  $accept = isset($_SERVER['HTTP_ACCEPT']) ? $_SERVER['HTTP_ACCEPT'] : '';
  foreach (explode(',', $accept) as $val) {
    if (preg_match('/(rdf\+n3|x\-turtle|rdf\+xml|text\/html|xhtml\+xml|xml|json)/', $val, $m)) {
      $o_vals[$m[1]] = 1;
      if (preg_match('/\;q\=([0-9\.]+)/', $val, $sub_m)) {
        /* a cast copes with malformed values such as "q=." */
        $o_vals[$m[1]] = (float) $sub_m[1];
      }
    }
  }
  /* arg (only a plain string can name a format) */
  if (isset($_GET['format']) && is_string($_GET['format'])) {
    $o_vals[$_GET['format']] = 1.1;
  }
  /* rank */
  arsort($o_vals);
  foreach ($o_vals as $val => $prio) {
    $prefs[] = $val;
  }
  /* default */
  $prefs[] = $default;
  foreach ($prefs as $pref) {
    if (isset($formats[$pref])) {
      return $formats[$pref];
    }
  }
  return null;
}
<?php
/**
* ARC2 result format detection
*
* @author Benjamin Nowack
* @license <http://arc.semsol.org/license>
* @homepage <http://arc.semsol.org/>
* @package ARC2
* @version 2010-11-16
*/
/**
 * Picks the result format preferred by the current request.
 *
 * Candidates come from the HTTP Accept header (ranked by their q value,
 * 1 when absent) and from the "format" GET argument, which outranks every
 * header entry. $default is appended as last candidate. The first candidate
 * that is a known format key decides the result.
 *
 * @param string $default Format key to fall back to.
 * @return string|null Format name (e.g. 'HTML', 'RDFXML', 'Turtle'), or null
 *                     when no candidate, including $default, is a known key.
 */
function ARC2_getPreferredFormat($default = 'plain') {
  $formats = array(
    'html' => 'HTML', 'text/html' => 'HTML', 'xhtml+xml' => 'HTML',
    'rdfxml' => 'RDFXML', 'rdf+xml' => 'RDFXML',
    'ntriples' => 'NTriples',
    'rdf+n3' => 'Turtle', 'x-turtle' => 'Turtle', 'turtle' => 'Turtle', 'text/turtle' => 'Turtle',
    'rdfjson' => 'RDFJSON', 'json' => 'RDFJSON',
    'xml' => 'XML',
    'legacyjson' => 'LegacyJSON'
  );
  $prefs = array();
  $o_vals = array();
  /* accept header (not sent by every client, and absent on the command line) */
  $accept = isset($_SERVER['HTTP_ACCEPT']) ? $_SERVER['HTTP_ACCEPT'] : '';
  foreach (explode(',', $accept) as $val) {
    /* $m[1] is the outer group, i.e. the complete matched format key */
    if (preg_match('/(rdf\+n3|(x\-|text\/)turtle|rdf\+xml|text\/html|xhtml\+xml|xml|json)/', $val, $m)) {
      $o_vals[$m[1]] = 1;
      if (preg_match('/\;q\=([0-9\.]+)/', $val, $sub_m)) {
        /* a cast copes with malformed values such as "q=." */
        $o_vals[$m[1]] = (float) $sub_m[1];
      }
    }
  }
  /* arg (only a plain string can name a format) */
  if (isset($_GET['format']) && is_string($_GET['format'])) {
    $o_vals[$_GET['format']] = 1.1;
  }
  /* rank */
  arsort($o_vals);
  foreach ($o_vals as $val => $prio) {
    $prefs[] = $val;
  }
  /* default */
  $prefs[] = $default;
  foreach ($prefs as $pref) {
    if (isset($formats[$pref])) {
      return $formats[$pref];
    }
  }
  return null;
}
......@@ -5,28 +5,24 @@ license: http://arc.semsol.org/license
class: ARC2 DC Extractor
author: Benjamin Nowack
version: 2008-04-09 (Fix: base URL (not doc URL) was used for annotations)
version: 2010-11-16
*/
ARC2::inc('RDFExtractor');
class ARC2_DcExtractor extends ARC2_RDFExtractor {
function __construct($a = '', &$caller) {
function __construct($a, &$caller) {
parent::__construct($a, $caller);
}
function ARC2_DcExtractor($a = '', &$caller) {
$this->__construct($a, $caller);
}
function __init() {
parent::__init();
$this->a['ns']['dc'] = 'http://purl.org/dc/elements/1.1/';
}
/* */
function extractRDF() {
$t_vals = array();
$t = '';
......@@ -43,7 +39,7 @@ class ARC2_DcExtractor extends ARC2_RDFExtractor {
$this->addTs(ARC2::getTriplesFromIndex($doc));
}
}
/* */
function extractTitle($n, $t_vals, $t) {
......@@ -52,7 +48,7 @@ class ARC2_DcExtractor extends ARC2_RDFExtractor {
}
return array($t_vals, $t);
}
/* */
function extractLink($n, $t_vals, $t) {
......@@ -69,7 +65,7 @@ class ARC2_DcExtractor extends ARC2_RDFExtractor {
}
return array($t_vals, $t);
}
function extractMeta($n, $t_vals, $t) {
if ($this->hasAttribute('http-equiv', $n, 'Content-Type') || $this->hasAttribute('http-equiv', $n, 'content-type')) {
if ($v = $this->v('content', '', $n['a'])) {
......@@ -78,7 +74,7 @@ class ARC2_DcExtractor extends ARC2_RDFExtractor {
}
return array($t_vals, $t);
}
/* */
}
......@@ -5,27 +5,23 @@ license: http://arc.semsol.org/license
class: ARC2 eRDF Extractor (w/o link title generation)
author: Benjamin Nowack
version: 2009-02-09 (Tweak: getRootNode returns 1st node if html tag is not found)
version: 2010-11-16
*/
ARC2::inc('RDFExtractor');
class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
function __construct($a = '', &$caller) {
function __construct($a, &$caller) {
parent::__construct($a, $caller);
}
function ARC2_ErdfExtractor($a = '', &$caller) {
$this->__construct($a, $caller);
}
function __init() {
parent::__init();
}
/* */
function extractRDF() {
if (!isset($this->caller->detected_formats['erdf'])) return 0;
$root_node = $this->getRootNode();
......@@ -40,9 +36,9 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
);
$this->processNode($root_node, $context);
}
/* */
function getRootNode() {
foreach ($this->nodes as $id => $node) {
if ($node['tag'] == 'html') {
......@@ -51,7 +47,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
return $this->nodes[0];
}
function getNamespaces() {
$r = array(
'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
......@@ -66,7 +62,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
/* */
function processNode($n, $ct) {
/* context */
//$ct['lang'] = $this->v('xml:lang', $ct['lang'], $n['a']);
......@@ -85,7 +81,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $ct['cur_res'],
's_type' => 'uri',
'p' => $uri,
'p' => $uri,
'o' => $ct['cur_obj_literal']['value'],
'o_type' => 'literal',
'o_lang' => $ct['cur_obj_literal']['datatype'] ? '' : $ct['cur_obj_literal']['lang'],
......@@ -101,7 +97,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $s,
's_type' => 'uri',
'p' => $ct['ns']['rdf'] . 'type',
'p' => $ct['ns']['rdf'] . 'type',
'o' => trim($uri),
'o_type' => 'uri',
'o_lang' => '',
......@@ -112,7 +108,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $ct['prev_res'],
's_type' => 'uri',
'p' => $uri,
'p' => $uri,
'o' => $ct['cur_res'],
'o_type' => 'uri',
'o_lang' => '',
......@@ -123,7 +119,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $ct['cur_res'],
's_type' => 'uri',
'p' => $uri,
'p' => $uri,
'o' => $ct['cur_obj_literal']['value'],
'o_type' => 'literal',
'o_lang' => $ct['cur_obj_literal']['datatype'] ? '' : $ct['cur_obj_literal']['lang'],
......@@ -146,7 +142,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $ct['cur_res'],
's_type' => 'uri',
'p' => $uri,
'p' => $uri,
'o' => $o,
'o_type' => 'uri',
'o_lang' => '',
......@@ -161,7 +157,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $s,
's_type' => 'uri',
'p' => $uri,
'p' => $uri,
'o' => $ct['cur_res'],
'o_type' => 'uri',
'o_lang' => '',
......@@ -178,7 +174,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $s,
's_type' => 'uri',
'p' => $ct['ns']['rdfs'] . 'label',
'p' => $ct['ns']['rdfs'] . 'label',
'o' => $ct['cur_obj_literal']['value'],
'o_type' => 'literal',
'o_lang' => $ct['cur_obj_literal']['datatype'] ? '' : $ct['cur_obj_literal']['lang'],
......@@ -193,7 +189,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
$t = array(
's' => $s,
's_type' => 'uri',
'p' => $ct['ns']['rdfs'] . 'label',
'p' => $ct['ns']['rdfs'] . 'label',
'o' => $ct['cur_obj_literal']['value'],
'o_type' => 'literal',
'o_lang' => $ct['cur_obj_literal']['datatype'] ? '' : $ct['cur_obj_literal']['lang'],
......@@ -213,7 +209,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
/* */
function getPropertyURIs($n, $ct) {
$r = array();
foreach (array('rel', 'rev', 'class', 'name', 'src') as $type) {
......@@ -237,7 +233,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
return $ct['cur_res'];
}
function getCurrentObjectID($n, $ct) {
foreach (array('href', 'src') as $a) {
if (isset($n['a'][$a])) {
......@@ -261,9 +257,9 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
return $r;
}
/* */
function xURI($v, $base, $ns, $attr_type = '') {
if ((list($sub_r, $sub_v) = $this->xQname($v, $base, $ns)) && $sub_r) {
return array($sub_r, $sub_v);
......@@ -273,7 +269,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
return array($this->calcURI($v, $base), '');
}
function xQname($v, $base, $ns) {
if ($sub_r = $this->x('([a-z0-9\-\_]+)[\-\.]([a-z0-9\-\_]+)', $v)) {
if (isset($ns[$sub_r[1]])) {
......@@ -282,7 +278,7 @@ class ARC2_ErdfExtractor extends ARC2_RDFExtractor {
}
return array(0, $v);
}
/* */
}
......@@ -5,28 +5,24 @@ license: http://arc.semsol.org/license
class: ARC2 foaf:openid Extractor
author: Benjamin Nowack
version: 2007-10-08
version: 2010-11-16
*/
ARC2::inc('RDFExtractor');