#!/usr/public/bin/perl # $Id: momspider,v 1.8 1994/08/10 10:18:29 fielding Exp $ # --------------------------------------------------------------------------- # Copyright (c) 1994 Regents of the University of California. # All rights reserved. # # MOMspider -- A World-Wide Web spider for multi-owner maintenance of # distributed hypertext infostructures. # # This software has been developed by Roy Fielding as # part of the Arcadia project at the University of California, Irvine. # See the file README.html for distribution info and pointers to documentation. # See the file docs/INSTALL.txt for installation instructions. # See the file MOM_Changes.pl for known problems and version information. # See below for usage information. # # The latest version of MOMspider can always be obtained from # # or # # If you have any suggestions, bug reports, fixes, or enhancements, # send them to the author Roy Fielding at . # # Redistribution and use in source and binary forms are permitted, # subject to the restriction noted below, provided that the above # copyright notice and this paragraph and the following paragraphs are # duplicated in all such forms and that any documentation, advertising # materials, and other materials related to such distribution and use # acknowledge that the software was developed in part by the University of # California, Irvine. The name of the University may not be used to # endorse or promote products derived from this software without # specific prior written permission. THIS SOFTWARE IS PROVIDED ``AS IS'' # AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT # LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE. # # Use of this software in any way or in any form, source or binary, # is not allowed in any country which prohibits disclaimers of any # implied warranties of merchantability or fitness for a particular # purpose or any disclaimers of a similar nature. 
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---------------------------------------------------------------------------

umask(022);       # Allow this process to produce world-readable files

# Library directories may be overridden through the environment.  A value
# of '.' (the fallback) leaves @INC untouched.
$WWWlib = ($ENV{'LIBWWW_PERL'}    || '.');
$MOMlib = ($ENV{'MOMSPIDER_HOME'} || '.');

if ($WWWlib ne '.') { unshift(@INC, $WWWlib); }
if ($MOMlib ne '.') { unshift(@INC, $MOMlib); }

require "MOM_Changes.pl";
require "getopts.pl";
require "www.pl";
require "momconfig.pl";
require "momhistory.pl";
require "momevent.pl";
require "momavoid.pl";
require "momvisit.pl";

# ==========================================================================
# Get the default configuration options from momconfig.pl
# (each one falls back to a built-in value when momconfig.pl leaves it unset)

$InstructFile = ($momconfig'InstructFile || ".momspider-instruct");
$SystemAvoid  = ($momconfig'SystemAvoid  || "system-avoid");
$SystemSites  = ($momconfig'SystemSites  || "system-sites");
$AvoidFile    = ($momconfig'AvoidFile    || ".momspider-avoid");
$SitesFile    = ($momconfig'SitesFile    || ".momspider-sites");
$MaxDepth     = ($momconfig'MaxDepth     || 20);

# ==========================================================================
# ==========================================================================
# Print the usage information if help requested (-h) or a bad option given.
#
# Takes no arguments and never returns: the message is emitted through die,
# so it goes to STDERR and the process exits with a failure status.  The
# bracketed defaults shown are the current values of the globals above.
#
sub usage
{
    die <<"EndUsage";
usage: momspider [-h] [-e errorfile] [-o outfile] [-i instructfile]
                 [-d maxdepth] [-a avoidfile] [-s sitesfile]
                 [-A system_avoidfile] [-S system_sitesfile]

$Version
WWW Spider for multi-owner maintenance of distributed hypertext infostructures.

Options:                                                        [DEFAULT]
     -h  Help -- just display this message and quit.
     -e  Append error history to the following file.           [STDERR]
     -o  Append output history to the following file.          [STDOUT]
     -i  Get your instructions from the following file.
         [$InstructFile]
     -d  Maximum traversal depth.                               [$MaxDepth]
     -a  Read/write the user's URLs to avoid into the following file.
         [$AvoidFile]
     -s  Read/write the user's sites visited into the following file.
         [$SitesFile]
     -A  Read the systemwide URLs to avoid from the following file.
         [$SystemAvoid]
     -S  Read the systemwide sites visited from the following file.
         [$SystemSites]

EndUsage
}

# ==========================================================================
# Get the command-line options

if (!(&Getopts('hi:o:e:d:a:s:A:S:')) || $opt_h) { &usage; }

if ($opt_e)
{
    # Open the error file on a scratch handle first.  If that fails, the
    # die message is still delivered on the original STDERR instead of
    # being written to a handle that has already been closed.
    open(MOMERR, ">> $opt_e") || die "Error opening err file: $!, stopped";
    open(STDERR, ">&MOMERR")  || die "Error opening err file: $!, stopped";
    close(MOMERR);
}
select((select(STDERR), $| = 1)[0]);       # Make STDERR unbuffered

if ($opt_o)
{
    # NOTE(review): the usage text says -o "Append"s, but the code below
    # keeps the previous file as "<outfile>.bak" and starts a fresh one.
    if (-e $opt_o)
    {
        # Report a failed backup: the open below truncates the old file.
        rename($opt_o, "$opt_o.bak") ||
            print STDERR "Cannot rename $opt_o to $opt_o.bak: $!\n";
    }
    close(STDOUT);
    open (STDOUT,"> $opt_o") || die "Error opening log file: $!, stopped";
}
$| = 1;                                    # Make STDOUT unbuffered

# Command-line values take precedence over the momconfig.pl defaults.
if ($opt_d) { $MaxDepth     = $opt_d; }
if ($opt_i) { $InstructFile = $opt_i; }
if ($opt_a) { $AvoidFile    = $opt_a; }
if ($opt_s) { $SitesFile    = $opt_s; }
if ($opt_A) { $SystemAvoid  = $opt_A; }
if ($opt_S) { $SystemSites  = $opt_S; }

# ==========================================================================
# Start working -- load the instructions and the avoid files

&momevent'begin_program($opt_e);

&read_instruct;                   # Get the instructions for each task

&www'set_def_header('http', 'User-Agent', $Version);
&momvisit'setMaxDepth($MaxDepth);

&momavoid'load($SystemAvoid, $SystemSites, 'R');  # Load systemwide avoids
&momavoid'load($AvoidFile,   $SitesFile,   'W');  # Load user's avoids

&momhistory'remember_tops(*TaskTopURL, *TaskIndexURL);

# Tasks are numbered from 1 (read_instruct pre-increments before storing).
# A slot with an empty type is skipped.
$task = 1;
while ($task <= $#TaskType)
{
    next unless $TaskType[$task];

    # The per-task exclusion list is stored as one '#'-separated string.
    foreach $exurl (split(/#/, $TaskExclude[$task]))
    {
        &momavoid'exclude($exurl);
    }

    &momvisit'infostruct($task, $TaskType[$task], $TaskName[$task],
                         $TaskTopURL[$task]);

    &momavoid'clear_excludes;
}
continue { $task++; }

&momavoid'save($AvoidFile, $SitesFile);  # Write user's avoid and sites files

&momevent'end_program($opt_e);

exit(0);

# ==========================================================================
# ==========================================================================
# read_instruct(): Read the task instructions from the InstructFile
#
sub read_instruct
{
    local($task, $intask, $innum, $type, $reason);

    print "Reading instructions from $InstructFile\n";

    if (!open(INSTRUCT, $InstructFile))
    {
        print STDERR "Cannot open the instruction file: $!\n";
        &usage;
    }

    @TaskType      = ();     # Required task type = ('Owner','Tree','Site').
    @TaskName      = ();     # Required task name for descriptive use.
    @TaskTopURL    = ();     # Required WWW URL for the starting top document.
    @TaskIndexURL  = ();     # Required WWW URL for the index document.
    @TaskIndexFile = ();     # Required full Unix pathname to index file.

    @TaskIndexTitle       = ();  # Optional title string for created index file.
    @TaskEmailAddress     = ();  # Optional email address to send alerts.
    @TaskEmailBroken      = ();  # Optional -- send alert if a link is broken?
    @TaskEmailChanged     = ();  # Optional -- send alert if modified within X days
    @TaskEmailExpired     = ();  # Optional -- send alert if expires within X days
    @TaskEmailRedirected  = ();  # Optional -- send alert if a link is redirected
    @TaskChangeWindow     = ();  # Optional days a change is still interesting
    @TaskExpireWindow     = ();  # Optional days before expiring is interesting
    @TaskExclude          = ();  # Optional URLs to exclude (leaf) from this task

    $task   = 0;
    $intask = 0;
    $innum  = 0;

    # Read the instruction file one line at a time into $_.  An empty
    # "while ()" would loop forever without ever consuming INSTRUCT.
    while (<INSTRUCT>)
    {
        next if ( /^$/ || /^\#/ );     # Ignore blank and comment lines

        if (!$intask)                  # We are not within a task,
        {
            $innum++;
            if ( /^<(\w+)\b/ )         # Start of next task?
{ $type = $1; if ($momconfig'Allowed{$type}) { $task++; $TaskType[$task] = $type; $TaskExclude[$task] = ''; $intask = 1; } else { print STDERR "Instruction type $type is not allowed"; print STDERR ", skipping instruction $innum\n"; while () { last if /^>$/; } } } elsif ( /^SystemAvoid\s+(\S+)\s/ ) { if (!$opt_A) { $SystemAvoid = $1; } } elsif ( /^SystemSites\s+(\S+)\s/ ) { if (!$opt_S) { $SystemSites = $1; } } elsif ( /^AvoidFile\s+(\S+)\s/ ) { if (!$opt_a) { $AvoidFile = $1; } } elsif ( /^SitesFile\s+(\S+)\s/ ) { if (!$opt_s) { $SitesFile = $1; } } elsif ( /^SitesCheck\s+(\d+)\s/ ) { &momavoid'setCheckInterval($1); } elsif ( /^ReplyTo\s+(\S.*)/ ) { &www'set_def_header('http','From',$1); } elsif ( /^MaxDepth\s+(\d+)\s/ ) { if (!$opt_d) { $MaxDepth = $1; } } else { print STDERR "Unrecognized instruction $innum at line $.\n"; print STDERR " of $InstructFile\n"; } } else # We are currently within a task { if ( /^>\s*$/ ) # Line indicates End of Task? { $reason = ''; if (! $TaskType[$task]) # Check for missing requirements { $reason = "has no task Type"; } elsif (! $TaskName[$task]) { $reason = "has no Name"; } elsif (! $TaskTopURL[$task]) { $reason = "has no TopURL"; } elsif (! $TaskIndexURL[$task] ) { $reason = "has no IndexURL"; } elsif (! $TaskIndexFile[$task]) { $reason = "has no IndexFile"; } elsif ((! $TaskEmailAddress[$task]) && ($TaskEmailBroken[$task] || $TaskEmailChanged[$task] || $TaskEmailExpired[$task] || $TaskEmailRedirected[$task] )) { $reason = "has no EmailAddress"; } if ($reason) # If a task requirement was not met, { # then undo its task options. 
print(STDERR "Instruction $innum ", $reason, ", skipping it\n"); undef $TaskType[$task]; undef $TaskName[$task]; undef $TaskTopURL[$task]; undef $TaskIndexURL[$task]; undef $TaskIndexFile[$task]; undef $TaskIndexTitle[$task]; undef $TaskEmailAddress[$task]; undef $TaskEmailBroken[$task]; undef $TaskEmailChanged[$task]; undef $TaskEmailExpired[$task]; undef $TaskEmailRedirected[$task]; undef $TaskChangeWindow[$task]; undef $TaskExpireWindow[$task]; undef $TaskExclude[$task]; $task--; } else # Fill in the defaults if needed { if (! $TaskIndexTitle[$task]) { $TaskIndexTitle[$task] = "MOMspider Index for " . $TaskName[$task]; } if (!defined($TaskChangeWindow[$task])) { $TaskChangeWindow[$task] = 7; } if (!defined($TaskExpireWindow[$task])) { $TaskExpireWindow[$task] = 0; } $TaskExclude[$task] .= $TaskIndexURL[$task]; } $intask = 0; } elsif ( /^