sitemap.php 11.9 KB
Newer Older
1
#!/usr/bin/env php
2
<?php
3
/*
4
 * StatusNet - the distributed open-source microblogging tool
5
 * Copyright (C) 2008, 2009, StatusNet, Inc.
6 7 8 9 10 11 12 13 14 15 16 17 18 19
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
20

21
// Root of the installation: the parent of the directory holding this script.
define('INSTALLDIR', realpath(dirname(__FILE__) . '/..'));

// Option string and usage text; both are set before commandline.inc is loaded.
$shortoptions = 'f:d:u:';

$helptext = <<<END_OF_SITEMAP_HELP
Script for creating sitemaps files per http://sitemaps.org/

    -f <indexfile>   Use <indexfile> as output file
    -d <outputdir>   Use <outputdir> for new sitemaps
    -u <outputurl>   Use <outputurl> as root for URLs

END_OF_SITEMAP_HELP;

require_once INSTALLDIR . '/scripts/commandline.inc';

// Validated -f/-d/-u values; the generator functions read this as a global.
$output_paths = parse_args();

// Write the individual sitemaps first; index_map() runs last because it
// lists whatever *.xml files it finds in the output directory.
standard_map();
notices_map();
user_map();
index_map();

43 44 45
// ------------------------------------------------------------------------------
// Main functions: get data out and turn them into sitemaps
// ------------------------------------------------------------------------------
46

47
// Generate index sitemap of all other sitemaps.
48 49
/**
 * Write the sitemap index: one <sitemap> entry for every *.xml file found in
 * the output directory, saved to the configured index file.
 *
 * Reads the global $output_paths ('output_dir', 'output_url', 'index_file').
 *
 * @return void
 */
function index_map()
{
    global $output_paths;
    $output_dir = $output_paths['output_dir'];
    $output_url = $output_paths['output_url'];
    $index_file = $output_paths['index_file'];

    // Start from an empty string so ".=" never appends to an undefined
    // variable, and an empty directory still yields a well-formed index.
    $index_urls = '';

    // glob() returns false on error; treat that like "no files".
    $files = glob($output_dir . '*.xml');
    if ($files === false) {
        $files = array();
    }

    // Resolve the index file once so that an index left in the output
    // directory by a previous run is not listed inside itself.
    $index_real = false;
    if (is_string($index_file) && $index_file !== '') {
        $index_real = realpath($index_file);
    }

    foreach ($files as $file_path) {
        if ($index_real !== false && realpath($file_path) === $index_real) {
            continue;
        }

        // basename() instead of a regex built from the directory name: the
        // directory may contain characters that are special in a pattern.
        $index_urls .= sitemap(
            array(
                'url'        => $output_url . basename($file_path),
                'changefreq' => 'daily',
            )
        );
    }

    write_file($index_file, sitemapindex($index_urls));
}

70
// Generate sitemap of standard site elements.
71 72
/**
 * Write "standard.xml" into the output directory: a sitemap holding the
 * public timeline, its RSS feed and the fixed list of documentation pages.
 *
 * Reads the global $output_paths ('output_dir').
 *
 * @return void
 */
function standard_map()
{
    global $output_paths;

    // Initialise before the first ".=" so nothing is appended to an
    // undefined variable.
    $standard_map_urls = '';

    $standard_map_urls .= url(
        array(
            'url'        => common_local_url('public'),
            'changefreq' => 'daily',
            'priority'   => '1',
        )
    );

    $standard_map_urls .= url(
        array(
            'url'        => common_local_url('publicrss'),
            'changefreq' => 'daily',
            'priority'   => '0.3',
        )
    );

    // Titles passed to the 'doc' action, one <url> entry each.
    $docs = array('about', 'faq', 'contact', 'im', 'openid', 'openmublog',
                  'privacy', 'source', 'badge');

    foreach ($docs as $title) {
        $standard_map_urls .= url(
            array(
                'url'        => common_local_url('doc', array('title' => $title)),
                'changefreq' => 'monthly',
                'priority'   => '0.2',
            )
        );
    }

    $urlset_path = $output_paths['output_dir'] . 'standard.xml';

    write_file($urlset_path, urlset($standard_map_urls));
}

109
// Generate sitemaps of all notices.
110 111
/**
 * Write the "notice-N.xml" sitemaps for rows of the notice table selected
 * with is_local = 1, at most 50,000 URLs per file.
 *
 * Only rows with an empty 'url' column and a non-empty 'uri' column are
 * listed; the 'uri' value is used as the <loc>.
 *
 * @return void
 */
function notices_map()
{
    $notices = DB_DataObject::factory('notice');

    $notices->query('SELECT id, uri, url, modified FROM notice where is_local = 1');

    // Map number => concatenated <url> elements. Initialised up front so an
    // empty result set hands array_to_map() an array, not an undefined value.
    $notice_list  = array();
    $notice_count = 0;
    $map_count    = 1;

    while ($notices->fetch()) {

        // Skip rows that carry a 'url' value (the original author's note:
        // "remote notices have an URL") and rows without a 'uri'.
        if ($notices->url || !$notices->uri) {
            continue;
        }

        // Maximum 50,000 URLs per sitemap file: start the next file.
        if ($notice_count == 50000) {
            $notice_count = 0;
            $map_count++;
        }

        // 'uri' is known to be non-empty here, so it is used directly; the
        // former fallback to the 'shownotice' URL could never be reached.
        $notice = array(
            'url'        => $notices->uri,
            'lastmod'    => common_date_w3dtf($notices->modified),
            'changefreq' => 'never',
            'priority'   => '1',
        );

        if (!isset($notice_list[$map_count])) {
            $notice_list[$map_count] = '';
        }

        $notice_list[$map_count] .= url($notice);
        $notice_count++;
    }

    // Make full sitemaps from the lists and save them.
    array_to_map($notice_list, 'notice');
}

148
// Generate sitemaps of all users.
149 150
/**
 * Write the per-user sitemaps, at most 50,000 users per file.
 *
 * For every row of the user table seven URLs are generated (profile stream,
 * its RSS, "all", its RSS, replies, its RSS, FOAF). Each kind goes into its
 * own series of files: user-N.xml, user_rss-N.xml, all-N.xml, all_rss-N.xml,
 * replies-N.xml, replies_rss-N.xml and foaf-N.xml.
 *
 * @return void
 */
function user_map()
{
    $users = DB_DataObject::factory('user');

    $users->query('SELECT id, nickname FROM user');

    // action name => array(output file prefix, changefreq, priority).
    // The order here is the order in which URLs are built and files written.
    $facets = array(
        'showstream' => array('user',        'daily',  '1'),
        'userrss'    => array('user_rss',    'daily',  '0.3'),
        'all'        => array('all',         'daily',  '1'),
        'allrss'     => array('all_rss',     'daily',  '0.3'),
        'replies'    => array('replies',     'daily',  '1'),
        'repliesrss' => array('replies_rss', 'daily',  '0.3'),
        'foaf'       => array('foaf',        'weekly', '0.5'),
    );

    // action name => (map number => concatenated <url> elements).
    // Initialised up front so nothing is appended to an undefined variable.
    $lists = array();
    foreach ($facets as $action => $facet) {
        $lists[$action] = array();
    }

    $user_count = 0;
    $map_count  = 1;

    while ($users->fetch()) {

        // Maximum 50,000 URLs per sitemap file: start the next file.
        if ($user_count == 50000) {
            $user_count = 0;
            $map_count++;
        }

        $user_args = array('nickname' => $users->nickname);

        // Construct a <url></url> element for each user facet and add it
        // to the list for the current map number.
        foreach ($facets as $action => $facet) {
            if (!isset($lists[$action][$map_count])) {
                $lists[$action][$map_count] = '';
            }

            $lists[$action][$map_count] .= url(
                array(
                    'url'        => common_local_url($action, $user_args),
                    'changefreq' => $facet[1],
                    'priority'   => $facet[2],
                )
            );
        }

        $user_count++;
    }

    // Make full sitemaps from the lists and save them.
    foreach ($facets as $action => $facet) {
        array_to_map($lists[$action], $facet[0]);
    }
}

238 239 240
// ------------------------------------------------------------------------------
// XML generation functions
// ------------------------------------------------------------------------------
241

242
// Generate a <url></url> element.
243 244
/**
 * Build one <url></url> element of a sitemap.
 *
 * @param array $url_args 'url' (required), plus optional 'lastmod',
 *                        'changefreq' and 'priority'
 *
 * @return string the element, tab-indented and newline-terminated
 */
function url($url_args)
{
    // Validate the raw value: once it has been through string replacement
    // it is always a string, so a later null check could never fire.
    if (!isset($url_args['url'])) {
        error("url() arguments require 'url' value.");
    }

    // Entity-escape all five characters XML reserves; '&' must come first
    // so the ampersands introduced by the other entities are not re-escaped.
    $url = str_replace(
        array('&', '<', '>', '"', "'"),
        array('&amp;', '&lt;', '&gt;', '&quot;', '&apos;'),
        $url_args['url']
    );

    // Optional keys: read them without touching undefined indexes.
    $lastmod    = isset($url_args['lastmod']) ? $url_args['lastmod'] : null;
    $changefreq = isset($url_args['changefreq']) ? $url_args['changefreq'] : null;
    $priority   = isset($url_args['priority']) ? $url_args['priority'] : null;

    $url_out = "\t<url>\n";
    $url_out .= "\t\t<loc>$url</loc>\n";

    if ($changefreq) {
        $url_out .= "\t\t<changefreq>$changefreq</changefreq>\n";
    }

    if ($lastmod) {
        $url_out .= "\t\t<lastmod>$lastmod</lastmod>\n";
    }

    // Explicit comparison rather than truthiness so a priority of '0' is
    // still written out.
    if ($priority !== null && $priority !== false && $priority !== '') {
        $url_out .= "\t\t<priority>$priority</priority>\n";
    }

    $url_out .= "\t</url>\n";

    return $url_out;
}

274 275
/**
 * Build one <sitemap></sitemap> element of a sitemap index.
 *
 * @param array $sitemap_args 'url' (required) and optional 'lastmod';
 *                            any other keys are ignored
 *
 * @return string the element, tab-indented and newline-terminated
 */
function sitemap($sitemap_args)
{
    // Validate the raw value: once it has been through string replacement
    // it is always a string, so a later null check could never fire.
    if (!isset($sitemap_args['url'])) {
        error("sitemap() arguments require 'url' value.");
    }

    // Entity-escape all five characters XML reserves; '&' must come first
    // so the ampersands introduced by the other entities are not re-escaped.
    $url = str_replace(
        array('&', '<', '>', '"', "'"),
        array('&amp;', '&lt;', '&gt;', '&quot;', '&apos;'),
        $sitemap_args['url']
    );

    // 'lastmod' is optional: read it without touching an undefined index.
    $lastmod = isset($sitemap_args['lastmod']) ? $sitemap_args['lastmod'] : null;

    $sitemap_out = "\t<sitemap>\n";
    $sitemap_out .= "\t\t<loc>$url</loc>\n";

    if ($lastmod) {
        $sitemap_out .= "\t\t<lastmod>$lastmod</lastmod>\n";
    }

    $sitemap_out .= "\t</sitemap>\n";

    return $sitemap_out;
}

295
// Generate a <urlset></urlset> element.
296 297
/**
 * Wrap already-built <url> elements in an XML declaration and a <urlset>
 * root element.
 *
 * @param string $urlset_text concatenated <url></url> elements
 *
 * @return string the complete sitemap document
 */
function urlset($urlset_text)
{
    $pieces = array(
        '<?xml version="1.0" encoding="UTF-8"?>',
        "\n",
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
        "\n",
        $urlset_text,
        '</urlset>',
    );

    return implode('', $pieces);
}

306
// Generate a <sitemapindex></sitemapindex> element.
307 308
/**
 * Wrap already-built <sitemap> elements in an XML declaration and a
 * <sitemapindex> root element.
 *
 * @param string $sitemapindex_text concatenated <sitemap></sitemap> elements
 *
 * @return string the complete sitemap index document
 */
function sitemapindex($sitemapindex_text)
{
    $prolog   = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
    $open_tag = '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";

    return $prolog . $open_tag . $sitemapindex_text . '</sitemapindex>';
}

317
// Generate a sitemap from an array containing <url></url> elements and write it to a file.
318 319
/**
 * Turn each entry of a list of <url> strings into a sitemap file.
 *
 * Every entry is wrapped with urlset() and written to
 * "<output_dir><filename_prefix>-<key>.xml", where output_dir comes from the
 * global $output_paths. An empty or non-array list writes nothing.
 *
 * @param array  $url_list        map number => concatenated <url></url> elements
 * @param string $filename_prefix leading part of each output file name
 *
 * @return void
 */
function array_to_map($url_list, $filename_prefix)
{
    global $output_paths;

    if (!is_array($url_list) || !$url_list) {
        return;
    }

    // foreach instead of each(): each() was deprecated in PHP 7.2 and
    // removed in PHP 8.0.
    foreach ($url_list as $map_idx => $map_urls) {
        // $map_urls is a long string containing concatenated <url></url> elements.
        $urlset_path = $output_paths['output_dir'] . "$filename_prefix-$map_idx.xml";

        write_file($urlset_path, urlset($map_urls));
    }
}

332 333 334
// ------------------------------------------------------------------------------
// Internal functions
// ------------------------------------------------------------------------------
335

336
// Parse command line arguments.
337 338
/**
 * Read and validate the -f, -d and -u command line options.
 *
 * Exits through error() when an option is missing or when the output
 * directory does not exist, is not a directory, or is not writable.
 *
 * @return array 'index_file' as given, plus 'output_dir' and 'output_url'
 *               each guaranteed to end with a slash
 */
function parse_args()
{
    $index_file = get_option_value('f');
    $output_dir = get_option_value('d');
    $output_url = get_option_value('u');

    // Fail before any sitemap is generated instead of at the final write
    // (missing -f) or with root-less URLs in the index (missing -u).
    // NOTE(review): assumes get_option_value() yields null/false/'' for an
    // option that was not given -- confirm against commandline.inc.
    if ($index_file === null || $index_file === false || $index_file === '') {
        error('no index file specified (use -f <indexfile>).');
    }

    if ($output_dir === null || $output_dir === false || $output_dir === '') {
        error('no output directory specified (use -d <outputdir>).');
    }

    if ($output_url === null || $output_url === false || $output_url === '') {
        error('no output URL specified (use -u <outputurl>).');
    }

    if (!file_exists($output_dir)) {
        error("output directory $output_dir does not exist.");
    }

    // file_exists() is also true for a regular file, which cannot hold the
    // generated sitemaps.
    if (!is_dir($output_dir)) {
        error("$output_dir is not a directory.");
    }

    if (is_writable($output_dir) === false) {
        error("$output_dir is not writable.");
    }

    $paths = array(
        'index_file' => $index_file,
        'output_dir' => trailing_slash($output_dir),
        'output_url' => trailing_slash($output_url),
    );

    return $paths;
}

360
// Ensure paths end with a "/".
361 362
/**
 * Return the path with a "/" appended unless the pattern already finds one
 * at its end.
 *
 * @param string $path file system path or URL
 *
 * @return string the path, ending in "/"
 */
function trailing_slash($path)
{
    $ends_with_slash = preg_match('/\/$/', $path);

    if ($ends_with_slash) {
        return $path;
    }

    return $path . '/';
}

370
// Write data to disk.
371 372
/**
 * Write data to a file, replacing any existing content.
 *
 * Exits through error() when an argument is null, when the file cannot be
 * opened, or when the data cannot be written completely.
 *
 * @param string $path file to write
 * @param string $data content to store
 *
 * @return void
 */
function write_file($path, $data)
{
    if (is_null($path)) {
        error('No path specified for writing to.');
    } elseif (is_null($data)) {
        error('No data specified for writing.');
    }

    if (($fh_out = fopen($path, 'w')) === false) {
        error("couldn't open $path for writing.");
    }

    $written = fwrite($fh_out, $data);

    // Always release the handle: this function is called once per sitemap
    // file, and closing also flushes buffered data to disk.
    $closed = fclose($fh_out);

    // A short write or a failed close means the file on disk is incomplete.
    if ($written === false || $written < strlen((string) $data) || $closed === false) {
        error("couldn't write to $path.");
    }
}

388
// Display an error message and exit.
389 390
/**
 * Print "Error: <message>" followed by a newline and terminate the script
 * with exit status 1.
 *
 * @param string|null $error_msg explanation; a stock text is used when null
 *
 * @return void this function does not return
 */
function error ($error_msg)
{
    $text = is_null($error_msg)
        ? 'error() was called without any explanation!'
        : $error_msg;

    echo 'Error: ' . $text . "\n";
    exit(1);
}

?>