I'll post this as it's fairly short. Run this script from cron:
#!/bin/sh
#
# Jaakko Hyvätti, Jaakko.Hyvatti@elma.fi
#
# Rebuild the site-wide robots.txt from the master file robots.txt.in
# and every robots.txt found below the document root (at depth 2 or
# more, i.e. not the top-level file itself).
#
root=/home/www/htdocs
#
# The output is written to a temporary file first; the live robots.txt
# is replaced only when makerobots.perl exits successfully.
#
# The find output is deliberately left unquoted so that each file found
# becomes a separate argument; paths containing whitespace are therefore
# not supported.
if makerobots.perl "$root/robots.txt.in" \
    `find "$root" -mindepth 2 -type f -name robots.txt` \
    > "$root/robots.txt.tmp~"
then
    mv "$root/robots.txt.tmp~" "$root/robots.txt"
fi
And this would do as makerobots.perl. Please feel free to modify it
and use it freely. Please improve it: it has not been in production use
anywhere, and it was written this morning.
#!/usr/local/bin/perl
# Use -*- perl -*- mode
#
# Jaakko Hyvätti, Jaakko.Hyvatti@elma.fi
# We open each file in turn and update an array of disallow
# directives with every robot. At the end we write the output.
#
# We check the syntax and complain to stderr. We also check for
# unauthorized lines like "Disallow: /" in robots.txts in the
# subdirectories.
# At least one input file must be named on the command line.
die "please specify arguments" if $#ARGV < 0;

# Fixed header of the generated robots.txt.
print "# This is <URL:http://www.my.domain/robots.txt>.\n";
print "# See http://info.webcrawler.com/mak/projects/robots/norobots.html\n";
print "# for more information about robot exclusion standard.\n";
print "#\n";
print "# This file is automatically generated using a script\n";
print "# by Jaakko Hyvätti, Jaakko.hyvatti\@elma.fi\n\n";

# Document root on disk.  The default is overridden by the directory
# part of the first argument (everything before its last "/"), so the
# first file given is expected to live directly in the document root.
$root = "/home/www/htdocs";
$ARGV[0] =~ m,^(.*)/, && ($root = $1);
# Read every file named on the command line, line by line, and collect
# the accepted Disallow lines per robot in %disallow, keyed by the
# User-agent value.  Each stored line carries a trailing comment naming
# the file and line number it came from.
#
# Problems are reported on STDERR as "file:line:message" and the
# offending line is skipped.  A Disallow path is accepted only when it
# starts with the URL directory of the robots.txt that contains it, so a
# file in a subdirectory cannot exclude anything outside that
# subdirectory.
while (<>) {
    s/#.*$//;    # strip comments
    s/\s+$//;    # strip trailing whitespace, including the newline

    if ($_ eq "") {
        # A blank line ends the current record.  Compared as a string so
        # that a line consisting of "0" is not mistaken for a blank one.
        undef $useragent;
    } elsif (/^User-agent\s*:?\s*(.*)$/i) {
        my $agent = $1;
        print STDERR "$ARGV:$.:warning:missing empty line between records\n"
            if defined $useragent;
        if ($agent ne "") {
            $useragent = $agent;
        } else {
            undef $useragent;
            print STDERR "$ARGV:$.:empty User-agent\n";
        }
    } elsif (/^Disallow\s*:?\s*(.*)$/i) {
        my $path = $1;
        if (defined $useragent) {
            # Split the current file name into its directory part (with
            # trailing slash) and the bare file name.
            my $base = "/";
            my $rest = $ARGV;
            if ($ARGV =~ m{^(.*/)}) {
                $base = $1;
                $rest = substr($ARGV, length $base);
            }

            # Turn the on-disk directory into a URL path by replacing the
            # leading document root ($root, set earlier in the script)
            # with "/".  The root is matched literally and only at the
            # start; interpolating it into eval'ed code, as was done
            # before, broke on paths containing "," or regex
            # metacharacters.
            $base =~ s{^\Q$root\E/}{/};

            if ($base eq substr($path, 0, length($base))) {
                $disallow{$useragent} .=
                    "Disallow: $path\t# $base$rest:$.\n";
            } else {
                print STDERR "$ARGV:$.:unauthorized line \"$_\"\n";
            }
        } else {
            print STDERR "$ARGV:$.:\"$_\" outside a record\n";
        }
    } else {
        print STDERR "$ARGV:$.:syntax error in \"$_\"\n";
    }

    # Closing ARGV at the end of each file resets $. so that reported
    # line numbers are per file rather than cumulative.
    if (eof) {
        close(ARGV);
    }
}
# Write the collected records: one record per named robot, then the
# catch-all "*" record (if any) last, then an end marker.  The robots
# are emitted in sorted order so that repeated runs over unchanged input
# produce an identical file; iterating the hash with each() gave an
# arbitrary order.
for my $agent (sort keys %disallow) {
    next if $agent eq "*";
    print "User-agent: $agent\n$disallow{$agent}\n";
}
print "User-agent: *\n" . $disallow{"*"} . "\n" if defined $disallow{"*"};
print "# eof\n";