Commit 53c86c43 authored by Brion Vibber's avatar Brion Vibber

Bringing Sphinx search support up to code: broken out to a plugin, now...

Bringing Sphinx search support up to code: broken out to a plugin, now supports multiple sites on a single server.

Upgrade notes:
* Index names have changed from hardcoded 'Identica_people' and 'Identica_notices' to use the database name and actual table names. Must reindex.

New events:
* GetSearchEngine to override default search engine class selection from plugins

New scripts:
* gen_config.php generates a sphinx.conf from database configuration (with theoretical support for the status_network table, but it doesn't seem to be cleanly queryable right now without knowing the db setup info for that. Needs generalized support.)
* Replaced old sphinx-indexer.sh and sphinx-cron.sh with index_update.php

Other fixes:
* sphinx.conf.sample better matches our live config, skipping unused stopword list and using a more realistic indexer memory limit

Further notes:
* Probably doesn't work right with PostgreSQL yet; Sphinx can pull from PG but the extraction queries currently look like they use some MySQL-specific functions.
parent 1cd6650a
......@@ -389,20 +389,16 @@ the server first.
Sphinx
------
To use a Sphinx server to search users and notices, you also need
to install, compile and enable the sphinx pecl extension for php on the
client side, which itself depends on the sphinx development files.
"pecl install sphinx" should take care of that. Add "extension=sphinx.so"
to your php.ini and reload apache to enable it.
To use a Sphinx server to search users and notices, you'll need to
enable the SphinxSearch plugin. Add to your config.php:
You can update your MySQL or Postgresql databases to drop their fulltext
search indexes, since they're now provided by sphinx.
addPlugin('SphinxSearch');
$config['sphinx']['server'] = 'searchhost.local';
On the sphinx server side, a script reads the main database and builds
the keyword index. A cron job reads the database and keeps the sphinx
indexes up to date. scripts/sphinx-cron.sh should be called by cron
every 5 minutes, for example. scripts/sphinx.sh is an init.d script
to start and stop the sphinx search daemon.
You also need to install, compile and enable the sphinx pecl extension for
php on the client side, which itself depends on the sphinx development files.
See plugins/SphinxSearch/README for more details and server setup.
SMS
---
......@@ -1168,17 +1164,6 @@ base: memcached uses key-value pairs to store data. We build long,
StatusNet site using your memcached server.
port: Port to connect to; defaults to 11211.
sphinx
------
You can get a significant boost in performance using Sphinx Search
instead of your database server to search for users and notices.
<http://sphinxsearch.com/>.
enabled: Set to true to enable. Default false.
server: a string with the hostname of the sphinx server.
port: an integer with the port number of the sphinx server.
emailpost
---------
......
......@@ -104,7 +104,7 @@ class NoticesearchAction extends SearchAction
{
$notice = new Notice();
$search_engine = $notice->getSearchEngine('identica_notices');
$search_engine = $notice->getSearchEngine('notice');
$search_engine->set_sort_mode('chron');
// Ask for an extra to see if there's more.
$search_engine->limit((($page-1)*NOTICES_PER_PAGE), NOTICES_PER_PAGE + 1);
......
......@@ -62,7 +62,7 @@ class NoticesearchrssAction extends Rss10Action
$notice = new Notice();
$search_engine = $notice->getSearchEngine('identica_notices');
$search_engine = $notice->getSearchEngine('notice');
$search_engine->set_sort_mode('chron');
if (!$limit) $limit = 20;
......
......@@ -61,7 +61,7 @@ class PeoplesearchAction extends SearchAction
function showResults($q, $page)
{
$profile = new Profile();
$search_engine = $profile->getSearchEngine('identica_people');
$search_engine = $profile->getSearchEngine('profile');
$search_engine->set_sort_mode('chron');
// Ask for an extra to see if there's more.
$search_engine->limit((($page-1)*PROFILES_PER_PAGE), PROFILES_PER_PAGE + 1);
......
......@@ -161,7 +161,7 @@ class TwitapisearchatomAction extends ApiAction
// lcase it for comparison
$q = strtolower($this->query);
$search_engine = $notice->getSearchEngine('identica_notices');
$search_engine = $notice->getSearchEngine('notice');
$search_engine->set_sort_mode('chron');
$search_engine->limit(($this->page - 1) * $this->rpp,
$this->rpp + 1, true);
......
......@@ -121,7 +121,7 @@ class TwitapisearchjsonAction extends ApiAction
// lcase it for comparison
$q = strtolower($this->query);
$search_engine = $notice->getSearchEngine('identica_notices');
$search_engine = $notice->getSearchEngine('notice');
$search_engine->set_sort_mode('chron');
$search_engine->limit(($this->page - 1) * $this->rpp, $this->rpp + 1, true);
if (false === $search_engine->query($q)) {
......
......@@ -184,27 +184,20 @@ class Memcached_DataObject extends DB_DataObject
require_once INSTALLDIR.'/lib/search_engines.php';
static $search_engine;
if (!isset($search_engine)) {
$connected = false;
if (common_config('sphinx', 'enabled')) {
$search_engine = new SphinxSearch($this, $table);
$connected = $search_engine->is_connected();
}
// unable to connect to sphinx' search daemon
if (!$connected) {
if ('mysql' === common_config('db', 'type')) {
$type = common_config('search', 'type');
if ($type == 'like') {
$search_engine = new MySQLLikeSearch($this, $table);
} else if ($type == 'fulltext') {
$search_engine = new MySQLSearch($this, $table);
} else {
throw new ServerException('Unknown search type: ' . $type);
}
if (Event::handle('GetSearchEngine', array($this, $table, &$search_engine))) {
if ('mysql' === common_config('db', 'type')) {
$type = common_config('search', 'type');
if ($type == 'like') {
$search_engine = new MySQLLikeSearch($this, $table);
} else if ($type == 'fulltext') {
$search_engine = new MySQLSearch($this, $table);
} else {
$search_engine = new PGSearch($this, $table);
throw new ServerException('Unknown search type: ' . $type);
}
} else {
$search_engine = new PGSearch($this, $table);
}
}
}
return $search_engine;
}
......
......@@ -57,14 +57,16 @@ class Status_network extends DB_DataObject
$config['db']['ini_'.$dbname] = INSTALLDIR.'/classes/status_network.ini';
$config['db']['table_status_network'] = $dbname;
self::$cache = new Memcache();
if (class_exists('Memcache')) {
self::$cache = new Memcache();
if (is_array($servers)) {
foreach($servers as $server) {
self::$cache->addServer($server);
if (is_array($servers)) {
foreach($servers as $server) {
self::$cache->addServer($server);
}
} else {
self::$cache->addServer($servers);
}
} else {
self::$cache->addServer($servers);
}
self::$base = $dbname;
......@@ -76,6 +78,10 @@ class Status_network extends DB_DataObject
static function memGet($k, $v)
{
if (!self::$cache) {
return self::staticGet($k, $v);
}
$ck = self::cacheKey($k, $v);
$sn = self::$cache->get($ck);
......@@ -92,10 +98,12 @@ class Status_network extends DB_DataObject
function decache()
{
$keys = array('nickname', 'hostname', 'pathname');
foreach ($keys as $k) {
$ck = self::cacheKey($k, $this->$k);
self::$cache->delete($ck);
if (self::$cache) {
$keys = array('nickname', 'hostname', 'pathname');
foreach ($keys as $k) {
$ck = self::cacheKey($k, $this->$k);
self::$cache->delete($ck);
}
}
}
......
......@@ -125,10 +125,6 @@ $default =
'public' => array()), # JIDs of users who want to receive the public stream
'invite' =>
array('enabled' => true),
'sphinx' =>
array('enabled' => false,
'server' => 'localhost',
'port' => 3312),
'tag' =>
array('dropoff' => 864000.0),
'popular' =>
......
......@@ -46,70 +46,11 @@ class SearchEngine
}
}
/**
 * Search engine that asks a Sphinx search daemon for matching ids and then
 * restricts the target data object to those ids.
 *
 * Server and port are read from the 'sphinx' section of the site
 * configuration via common_config().
 */
class SphinxSearch extends SearchEngine
{
    /** Sphinx client handle; only assigned when the daemon was reachable. */
    private $sphinx;

    /** Result of the constructor's connection probe. */
    private $connected;

    /**
     * Probe the configured daemon and, if it answers, set up the client.
     *
     * When the probe fails the object is left unconnected: neither the
     * parent constructor nor the client is initialised, so callers must
     * check is_connected() before using the instance.
     *
     * @param object $target data object the search will be applied to
     * @param string $table  name passed to Sphinx as the index to query
     */
    function __construct($target, $table)
    {
        // '@' suppresses the warning on a refused connection; the failure is
        // reported through is_connected() instead.
        $fp = @fsockopen(common_config('sphinx', 'server'), common_config('sphinx', 'port'));
        if (!$fp) {
            $this->connected = false;
            return;
        }
        fclose($fp);

        parent::__construct($target, $table);
        $this->sphinx = new SphinxClient;
        $this->sphinx->setServer(common_config('sphinx', 'server'), common_config('sphinx', 'port'));
        $this->connected = true;
    }

    /**
     * @return boolean whether the daemon answered the constructor's probe
     */
    function is_connected()
    {
        return $this->connected;
    }

    /**
     * Set paging limits on both the Sphinx client and the target object.
     *
     * @param int     $offset offset passed to Sphinx
     * @param int     $count  number of rows requested
     * @param boolean $rss    when true the third setLimits() argument is
     *                        $count; otherwise it is 50 * ($count - 1)
     * @return mixed whatever the target's limit(0, $count) returns
     */
    function limit($offset, $count, $rss = false)
    {
        //FIXME without LARGEST_POSSIBLE, the most recent results aren't returned
        //      this probably has a large impact on performance
        $LARGEST_POSSIBLE = 1e6;

        if ($rss) {
            $this->sphinx->setLimits($offset, $count, $count, $LARGEST_POSSIBLE);
        }
        else {
            // return at most 50 pages of results
            $this->sphinx->setLimits($offset, $count, 50 * ($count - 1), $LARGEST_POSSIBLE);
        }

        // The offset is already applied by Sphinx, so the target starts at 0.
        return $this->target->limit(0, $count);
    }

    /**
     * Run the query through Sphinx and constrain the target to the ids it
     * returned.
     *
     * @param string $q search query
     * @return boolean true when matching ids were applied to the target,
     *                 false when the result contains no matches
     */
    function query($q)
    {
        $result = $this->sphinx->query($q, $this->table);
        // empty() rather than isset(): a present-but-empty match list would
        // otherwise produce the invalid SQL "id in ()".
        if (empty($result['matches'])) {
            return false;
        }
        $id_set = join(', ', array_keys($result['matches']));
        $this->target->whereAdd("id in ($id_set)");
        return true;
    }

    /**
     * Apply a sort order to both Sphinx and the target object.
     *
     * Only 'chron' is handled; any other mode is ignored and nothing is
     * returned.
     *
     * @param string $mode sort mode name
     * @return mixed result of the target's orderBy() for 'chron'
     */
    function set_sort_mode($mode)
    {
        if ('chron' === $mode) {
            $this->sphinx->SetSortMode(SPH_SORT_ATTR_DESC, 'created_ts');
            return $this->target->orderBy('created desc');
        }
    }
}
class MySQLSearch extends SearchEngine
{
function query($q)
{
if ('identica_people' === $this->table) {
if ('profile' === $this->table) {
$this->target->whereAdd('MATCH(nickname, fullname, location, bio, homepage) ' .
'AGAINST (\''.addslashes($q).'\' IN BOOLEAN MODE)');
if (strtolower($q) != $q) {
......@@ -117,7 +58,7 @@ class MySQLSearch extends SearchEngine
'AGAINST (\''.addslashes(strtolower($q)).'\' IN BOOLEAN MODE)', 'OR');
}
return true;
} else if ('identica_notices' === $this->table) {
} else if ('notice' === $this->table) {
// Don't show imported notices
$this->target->whereAdd('notice.is_local != ' . Notice::GATEWAY);
......@@ -143,13 +84,13 @@ class MySQLLikeSearch extends SearchEngine
{
function query($q)
{
if ('identica_people' === $this->table) {
if ('profile' === $this->table) {
$qry = sprintf('(nickname LIKE "%%%1$s%%" OR '.
' fullname LIKE "%%%1$s%%" OR '.
' location LIKE "%%%1$s%%" OR '.
' bio LIKE "%%%1$s%%" OR '.
' homepage LIKE "%%%1$s%%")', addslashes($q));
} else if ('identica_notices' === $this->table) {
} else if ('notice' === $this->table) {
$qry = sprintf('content LIKE "%%%1$s%%"', addslashes($q));
} else {
throw new ServerException('Unknown table: ' . $this->table);
......@@ -165,9 +106,9 @@ class PGSearch extends SearchEngine
{
function query($q)
{
if ('identica_people' === $this->table) {
if ('profile' === $this->table) {
return $this->target->whereAdd('textsearch @@ plainto_tsquery(\''.addslashes($q).'\')');
} else if ('identica_notices' === $this->table) {
} else if ('notice' === $this->table) {
// XXX: We need to filter out gateway notices (notice.is_local = -2) --Zach
......
You can get a significant boost in performance using Sphinx Search
instead of your database server to search for users and notices.
<http://sphinxsearch.com/>.
Configuration
-------------
In StatusNet's configuration, you can adjust the following settings
under 'sphinx':
enabled: Set to false to turn Sphinx search off while leaving the plugin loaded. Default true.
server: a string with the hostname of the sphinx server.
port: an integer with the port number of the sphinx server.
Requirements
------------
To use a Sphinx server to search users and notices, you also need
to install, compile and enable the sphinx pecl extension for php on the
client side, which itself depends on the sphinx development files.
"pecl install sphinx" should take care of that. Add "extension=sphinx.so"
to your php.ini and reload apache to enable it.
You can update your MySQL or Postgresql databases to drop their fulltext
search indexes, since they're now provided by sphinx.
You will also need a Sphinx server to serve the search queries.
On the sphinx server side, a script reads the main database and builds
the keyword index. A cron job reads the database and keeps the sphinx
indexes up to date. scripts/sphinx-cron.sh should be called by cron
every 5 minutes, for example. scripts/sphinx.sh is an init.d script
to start and stop the sphinx search daemon.
Server configuration
--------------------
scripts/gen_config.php can generate a sphinx.conf file listing MySQL
data sources for your databases. You may need to tweak paths afterwards.
$ plugins/SphinxSearch/scripts/gen_config.php > sphinx.conf
If you wish, you can build a full config yourself based on sphinx.conf.sample.
<?php
/**
* StatusNet, the distributed open-source microblogging tool
*
* PHP version 5
*
* LICENCE: This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @category Plugin
* @package StatusNet
* @author Brion Vibber <brion@status.net>
* @copyright 2009 Control Yourself, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html GNU Affero General Public License version 3.0
* @link http://laconi.ca/
*/
// Stop with exit status 1 when the STATUSNET constant is not defined, i.e.
// when this file is loaded from a context that has not set it up.
if (!defined('STATUSNET')) {
exit(1);
}
// Fill in every 'sphinx' setting that the site configuration left unset;
// values already present in $config are never overwritten.
global $config;
$sphinxDefaults = array(
    'enabled' => true,
    'server'  => 'localhost',
    'port'    => 3312,
);
foreach ($sphinxDefaults as $key => $val) {
    if (isset($config['sphinx'][$key])) {
        continue;
    }
    $config['sphinx'][$key] = $val;
}
/**
 * Plugin for Sphinx search backend.
 *
 * Handles autoloading of the SphinxSearch class and answers the
 * GetSearchEngine hook with a SphinxSearch instance when Sphinx is enabled
 * and reachable.
 *
 * @category Plugin
 * @package  StatusNet
 * @author   Brion Vibber <brion@status.net>
 * @license  http://www.fsf.org/licensing/licenses/agpl-3.0.html GNU Affero General Public License version 3.0
 * @link     http://laconi.ca/
 * @link     http://twitter.com/
 */
class SphinxSearchPlugin extends Plugin
{
    /**
     * Load the SphinxSearch class file on demand.
     *
     * @param string $cls name of the class being autoloaded
     * @return boolean hook return; false once this plugin has loaded the class
     */
    function onAutoload($cls)
    {
        if ($cls == 'SphinxSearch') {
            $file = INSTALLDIR . '/plugins/SphinxSearch/' . strtolower($cls) . '.php';
            include_once $file;
            return false;
        }
        return true;
    }

    /**
     * Offer a Sphinx-backed search engine for the given table type.
     *
     * @param Memcached_DataObject $target         object the search applies to
     * @param string               $table          table type being searched
     * @param out                  &$search_engine receives the SphinxSearch
     *                                             object when one is connected
     * @return boolean hook return; false when $search_engine was filled in
     * @throws ServerException when Sphinx is enabled but the SphinxClient
     *                         class is not available
     */
    function onGetSearchEngine(Memcached_DataObject $target, $table, &$search_engine)
    {
        if (!common_config('sphinx', 'enabled')) {
            // Sphinx disabled
            return true;
        }

        if (!class_exists('SphinxClient')) {
            throw new ServerException('Sphinx PHP extension must be installed.');
        }

        $engine = new SphinxSearch($target, $table);
        if (!$engine->is_connected()) {
            // Sphinx disconnected
            return true;
        }

        $search_engine = $engine;
        return false;
    }
}
#!/usr/bin/env php
<?php
/*
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2009, StatusNet, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// INSTALLDIR is the directory three levels above this script.
define('INSTALLDIR', realpath(dirname(__FILE__) . '/../../..'));
// Option list and help text are set before commandline.inc is loaded;
// presumably that include reads them when parsing arguments (confirm there).
$longoptions = array('base=', 'network');
// The default shown for --base matches what sphinx_base() actually returns.
$helptext = <<<END_OF_TRIM_HELP
Generates sphinx.conf file based on StatusNet configuration.
--base Base dir to Sphinx install
(default /usr/local/sphinx)
--network Use status_network global config table
(non-functional at present)
END_OF_TRIM_HELP;
require_once INSTALLDIR . '/scripts/commandline.inc';
require dirname(__FILE__) . '/sphinx-utils.php';
// Resolve the Sphinx install prefix up front: the searchd section below
// interpolates {$base} into its log and pid paths, and it was previously
// undefined at script scope, yielding paths such as "/log/searchd.log".
$base = sphinx_base();
$timestamp = date('r');
// Header comment. The blank line before the terminator ends the header with
// a newline so the first site section starts on its own line.
print <<<END
#
# Sphinx configuration for StatusNet
# Generated {$timestamp}
#

END;
// One source/index pair per site and table.
sphinx_iterate_sites('sphinx_site_template');
// Global indexer and searchd settings. The leading blank line separates them
// from the last site section, which does not end in a newline.
print <<<END

indexer
{
mem_limit = 300M
}
searchd
{
port = 3312
log = {$base}/log/searchd.log
query_log = {$base}/log/query.log
read_timeout = 5
max_children = 30
pid_file = {$base}/log/searchd.pid
max_matches = 1000
seamless_rotate = 1
preopen_indexes = 0
unlink_old = 1
}

END;
/**
 * Emit the Sphinx config sections for one site: first the profile table,
 * then the notice table.
 *
 * @fixme we only seem to have master DB currently available...
 *
 * @param object $sn site record handed through to sphinx_template()
 * @return string concatenation of the two sphinx_template() return values
 */
function sphinx_site_template($sn)
{
    $profileSection = sphinx_template(
        $sn,
        'profile',
        'SELECT id, UNIX_TIMESTAMP(created) as created_ts, nickname, fullname, location, bio, homepage FROM profile',
        'SELECT * FROM profile where id = $id'
    );

    $noticeSection = sphinx_template(
        $sn,
        'notice',
        'SELECT id, UNIX_TIMESTAMP(created) as created_ts, content FROM notice',
        'SELECT * FROM notice where notice.id = $id AND notice.is_local != -2'
    );

    return $profileSection . $noticeSection;
}
/**
 * Print one "source" stanza and one "index" stanza of a Sphinx config for a
 * single table of a single site. Nothing is returned; output goes to stdout.
 *
 * Source and index names are built from the site's database name and the
 * table name ("<dbname>_src_<table>" and "<dbname>_<table>"). The source type
 * is taken from the 'db' / 'type' configuration value.
 *
 * @param object $sn         site record; sitename, dbname, dbhost, dbuser
 *                           and dbpass are read from it
 * @param string $table      table name used in the source/index names and
 *                           the index data path
 * @param string $query      SQL written out as sql_query
 * @param string $query_info SQL written out as sql_query_info
 */
function sphinx_template($sn, $table, $query, $query_info)
{
// Install prefix used for the index data path below.
$base = sphinx_base();
$dbtype = common_config('db', 'type');
print <<<END
#
# {$sn->sitename}
#
source {$sn->dbname}_src_{$table}
{
type = {$dbtype}
sql_host = {$sn->dbhost}
sql_user = {$sn->dbuser}
sql_pass = {$sn->dbpass}
sql_db = {$sn->dbname}
sql_query_pre = SET NAMES utf8;
sql_query = {$query}
sql_query_info = {$query_info}
sql_attr_timestamp = created_ts
}
index {$sn->dbname}_{$table}
{
source = {$sn->dbname}_src_{$table}
path = {$base}/data/{$sn->dbname}_{$table}
docinfo = extern
charset_type = utf-8
min_word_len = 3
}
END;
}
#!/usr/bin/env php
<?php
/*
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2009, StatusNet, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// INSTALLDIR is the directory three levels above this script.
define('INSTALLDIR', realpath(dirname(__FILE__) . '/../../..'));
// 'rotate' is declared here because the help text documents --rotate and
// sphinx_index_update() checks have_option('rotate'); it was missing from the
// list. Presumably commandline.inc only accepts declared options (confirm).
$longoptions = array('base=', 'network', 'rotate');
// The default shown for --base matches what sphinx_base() actually returns.
$helptext = <<<END_OF_TRIM_HELP
Runs Sphinx search indexer.
--rotate Have Sphinx run index update in background and
rotate updated indexes into place as they finish.
--base Base dir to Sphinx install
(default /usr/local/sphinx)
--network Use status_network global config table for site list
(non-functional at present)
END_OF_TRIM_HELP;
require_once INSTALLDIR . '/scripts/commandline.inc';
require dirname(__FILE__) . '/sphinx-utils.php';
// Run the indexer once per site.
sphinx_iterate_sites('sphinx_index_update');
/**
 * Run the Sphinx indexer for one site's notice and profile indexes.
 *
 * Index names are "<dbname>_notice" and "<dbname>_profile". The indexer
 * binary and sphinx.conf are looked up under the base directory returned by
 * sphinx_base(). The command line is printed before it is executed.
 *
 * @param object $sn site record; only its dbname property is read here
 */
function sphinx_index_update($sn)
{
    $base = sphinx_base();
    $baseIndexes = array('notice', 'profile');
    $params = array();
    if (have_option('rotate')) {
        $params[] = '--rotate';
    }
    foreach ($baseIndexes as $index) {
        // Quote index names: dbname comes from configuration and is not
        // guaranteed to be free of shell metacharacters or spaces.
        $params[] = escapeshellarg("{$sn->dbname}_{$index}");
    }
    $params = implode(' ', $params);
    // Paths are quoted too, so a base directory containing spaces works.
    $cmd = escapeshellarg("$base/bin/indexer") .
        ' --config ' . escapeshellarg("$base/etc/sphinx.conf") .
        " $params";
    print "$cmd\n";
    system($cmd);
}
<?php
/*
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2009, StatusNet, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Report whether the 'network' command-line option was given.
 *
 * @return mixed the result of have_option('network')
 */
function sphinx_use_network()
{
    $networkRequested = have_option('network');
    return $networkRequested;
}
/**
 * Base directory of the Sphinx installation.
 *
 * @return string value of the 'base' command-line option when given,
 *                otherwise "/usr/local/sphinx"
 */
function sphinx_base()
{
    return have_option('base')
        ? get_option_value('base')
        : "/usr/local/sphinx";
}
/**
 * Invoke a callback once for every site that should be handled.
 *
 * With the 'network' option, sites are read from the status_network table
 * and each fetched Status_network row is passed to the callback. Otherwise a
 * single site record is built from the configured database DSN and the site
 * name, with the properties sitename, dbhost, dbuser, dbpass and dbname.
 *
 * Both failure paths print a message and end the script with exit status 1.
 *
 * @param callback $callback function taking the site record as its only
 *                           argument; its return value is ignored
 */
function sphinx_iterate_sites($callback)
{
    if (sphinx_use_network()) {
        // @fixme this should use, like, some kind of config
        Status_network::setupDB('localhost', 'statusnet', 'statuspass', 'statusnet');
        $sn = new Status_network();
        if (!$sn->find()) {
            // exit(1) rather than die("..."): die with a string message ends
            // the script with status 0, hiding the failure from callers.
            print "Confused... no sites in status_network table or lookup failed.\n";
            exit(1);
        }
        while ($sn->fetch()) {
            $callback($sn);
        }
    } else {
        // Expected DSN form: type://user:pass@host/dbname
        if (preg_match('!^(mysqli?|pgsql)://(.*?):(.*?)@(.*?)/(.*?)$!',
                       common_config('db', 'database'), $matches)) {
            // The driver name (group 1) is validated by the pattern but not
            // needed here.
            list(/*all*/, /*dbtype*/, $dbuser, $dbpass, $dbhost, $dbname) = $matches;
            $sn = (object)array(
                'sitename' => common_config('site', 'name'),
                'dbhost' => $dbhost,
                'dbuser' => $dbuser,
                'dbpass' => $dbpass,
                'dbname' => $dbname);
            $callback($sn);
        } else {
            print "Unrecognized database configuration string in config.php\n";
            exit(1);
        }
    }
}
<?php
/*
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2008, 2009, StatusNet, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Stop with exit status 1 when the STATUSNET constant is not defined, i.e.
// when this file is loaded from a context that has not set it up.
if (!defined('STATUSNET')) {
exit(1);
}