#!/usr/local/bin/perl
# This program filters raw ls listings files to produce a file that archie
# can parse.  It started life as the filt program in the original archie
# distribution and was converted to perl and modified by Amos Shapira.
# Then significant modifications and improvements, as well as many hours of
# testing, were done by Eric Anderson at SURAnet (eanders@sura.net).
#
# Please report any bugs or send any patches to archie-admin@sura.net
# Also regular expression gurus should feel free to comment what various
# undocumented sections of the code actually does.
#
# Notes:
# Need to add to fixpermerrors a check for whatever stupid thing the site
# is sticking onto the front.
# SITES we do not handle yet:
# eba.eb.ele.tue.nl 
# harvard.harvard.edu
# biox.unibas.ch
# garbo.uwasa.fi
# inria.inria.fr
# thumper.bellcore.com
# Filter version 1.0.4
$debuglevel=0;
				# Debuglevels:
$\ = "\n";		# automatically add newline on print
@legit = ("[dl-]","[r-]","[w-]","[xsS-]","[r-]","[w-]",
	  "[xsS-]","[r-]","[w-]","[x-]",
	  "(\\d| )","(\\d| )","(\\d| )","[ \\d\\w]","[ \\d\\w]");
#for ($loop=
# Legit patterns for chars in permission part of line.

$fulllegit = join("",@legit);
$aixlegit = join("","^[DF]",@legit[1..9]);
$dirperms = join("",'d',@legit[1..9]);
$permlegit = join("","^",@legit[0..9]);
$idlegit = "^(\\w|\\d|-)+$";
#print "'$dirperms'";
$badchar = "[^ -~\\t]";
$corruptpattern = join('',$badchar,".*",$badchar,".*",$badchar,".*");
$filenamebegin= -1;     # Automagically initialized on the first directory.
$printedaline=0;
$maxdiraddsize=40;
$curdir="";
undef @basedirs;
$lastline="@@@beginning";

for (;<>;) {
    chop;
# Commented out study because it was making some stuff break, I don't
# quite know why, below is code which breaks if it is below the study.
#    print "$_";
#	print "MOOOOF" if /$dirperms/o;
#	next;
#    study;

    die ("@@@ Belch -- Corrupted input?\n") if /$corruptpattern/o;

# If any of these cases are true, the line is not printed.
    
# Remove totaling lines
    next if (/^total[ \t]+\d+[ \t]*$/o);
    next if (/^Total:[ \t]+\d+[ \t]*kbytes$/o);
# Something with opendir in it.
    next if (/^opendir:/o);# {
# Toss we are in europe lines
    next if (/WE ARE IN EUROPE/o);
# Don't print character or block devices.
    next if (/[cb][-rwxSsTt]{9}/o);
# Chuck lines having that pattern in them.
    next if (/can not access/o);
# Chuck lines as seen below.
    next if (/stale nfs file handle/io);
# Another bizarre case
    next if (/^\.:?$/o);
# Throw away leading blank lines
  next if (/^$/ && !$printedaline);
# I wonder what this does.
    next if (/^[ \t]/o);
# Throw away lines containing unreadable.
    next if (/unreadable/o);
# Throw away lines which have : no /dev/zero at the end of them
    next if (/: no \/dev\/zero$/o);
# Throw away lines which have No such file or directory in them
    next if (/No such file or directory$/o);
# Throw away lines with crt0: no /usr/lib/ld.so -- for sparta.spartacus.com
    next if (/^crt0: no \/usr\/lib\/ld.so$/o);
# Throw away short lines which aren't blank and aren't directories.
# First seen on cs.tut.fi
    next if ((length($_)<10)&&(!/:$/o)&&(!/^$/o)&&!(/^\.|\//o));
    next if ((length($_)<$filenamebegin) && /^$fulllegit/o &&
	     !(/Permission denied/o||/not found/o||/cannot access/o));
# Throw away ld.so warnings
    next if (/^ld.so: warning: /o);
# Throw away more ld.so errors
    next if (/^ld.so: map heap error \(22\) for \/dev\/zero/o);
# Throw away the line ${org}: for eba.eb.ele.tue.nl
    next if (/^\$\{org\}\:$/o);
# Throw away lines with connection timed out for ftp.informatik.rwth-aachen.de
    next if (/^ls:.*Connection timed out$/o);
# Throw away this line for aix370.rrz.uni-koeln.de
    next if (/^\.disk1\:$/o);
# Remove blank lines which precede filename entries so that enter doesn't
# think they are supposed to be directory names.
    if (/^$/o) {
	$_ = <STDIN>;
#	print STDERR "$_";		# ***
	last if !defined $_;
	chop;
	if (!/^$fulllegit/o) {
	    print "";
	    $lastline = "";
	    $_ .= " ";		# For the chop to eat.
	    redo;
	} 
    }
    
    if ($filenamebegin<0&&/^$dirperms/o) {
	$filenamebegin = length($_);
	do {
	    --$filenamebegin;
	    die ("filenamebegin dropped too much??") if $filenamebegin<20;
	} until ((substr($_,$filenamebegin,1) eq " ")&&
		 (substr($_,$filenamebegin-2,1) =~ /\d/o));
	++$filenamebegin;
    }
# Make sure we don't get stuck in a loop.
    $count=0;
    $start=$_;
# This forces idempotency. the loop that is.
    do {
	++$count;
	die("@@@ iterated for a long time on \n'$start'\n, never got done.\n") if $count>50;
#	warn("@@@ iterations:$count") if $count>2;
	$orig = $_;
# Try to put : after dir names in listing.
# bin:
# files
	
	if ((/^\./o || /^\//o) && /\w$/o  && !/Permission denied/o) {
	    print "";
	    $lastline = "";
	    $_ = "$_:";
	}
# Dump a return in if the last line was all printable chars with a colon
# on the end, e.g. a directory, and the last line was for real.
	if (!/^$fulllegit/o) {
	    print "" if (/^[\w\/+#-\.]+:$/o && $lastline);
# Remove an extra color from a directory name entry if it exists.
# For wuarchive.wustl.edu
	    s/::$/:/o;
	}

#General cleanup
# Hack out garbage people put on front of listings.
	if (/^\//o || /^\./o) {
	    s!^\./!!o;
	    s!^/usr/spool/ftp/!!o;
	    s!^/pub/!!o;
	    s!^/usr/local/pub/!!o;
	    s!^/home/ftp/pub/!!o;
	    s!^/ftp/pub/!!o;
	    s!^/com/ftp/pub/!!o;
	    s!^/var/spool/uucppublic/!!o;
	    s!^/com/ftp/sun4/pub/!!o;
	    s!^/users/ftp/!!o;
	    s!^.disk1/!!o; # For aix370.rrz.uni-koeln.de
	}

# What's this do?
#	s/^([-dl][-rwxSsTt]{9}.*)(\\$)/$1/o; 
#	s/^([-dl][-rwxSsTt]{9})(\d+)/$1 $2/o; 
# Take out trailing / from directory listing
	s/^(d.*)\/$/$1/o;
# Take :'s off the end of lines which aren't really directories names.
# Why would I want to do this? Note I still do.
# Do this so that the next line will work right.
	s/^($fulllegit.*)\:$/$1/o;
# Hack for walhalla.informatik.uni-dortmund.de, user/group names with
# spaces in them.
	s/NOT FTP/NOT_FTP/go;
# Hack for gargoyle.uchicago.edu, to fix directory with return in the name
	if (/^pub\/emwq\/Mailboxes.*h$/o) {
	    $_ .= ":";
	    $foo = <STDIN>;
	}
# Two hacks for eba.eb.ele.tue.nl
	if (/^l.*local.$/o) {
	    s/^l(.*local)./d$1/o;
	}
	if (/^pub\/apollo\/local\/News\:$/o) {
	    print "pub/apollo/local:";
	    print "drwxrwxrwx   1 news           15 Mar 15 12:00 News";
	    print "";
	}

#Put space between permissions and id.
	if (/^[ld-][r-][w-][x-]/o && /^..........\d/o) {
	    s/^(..........)/$1 /o;
	    if (substr($_,$filenamebegin-1,2) eq '  ') {
		substr($_,$filenamebegin-1,2) = ' ';
	    }
	}

#Fix AIX bogosity
	if (/$aixlegit/o) {
            # Aix ls follows directory symlinks.
	    s/ \-\> .*$//o if (/^D/o && / \-\> /o);
	    s/^D/d/o;
	    s/^F/-/o;
	}
#Special hack for earth.rs.itd.umich.edu
	if (/^mac\.bin\/\.AppleDesktop\/_:$/o && !$hack'umich_edu) {
#	    print STDERR "@Did hack for earth on $_."; # ***
	    $hack'umich_edu=1;
	    while(!/^$/o) {
		$_=<STDIN>;
	    }
	}

#Replace trailing spaces with underscores in directory listings.
	$spacepos=rindex($_," ");
	while (($spacepos>=$filenamebegin)
	      &&((/ _*$/o)||
		 (substr($_,$filenamebegin-6) =~ /^(\d| )\d(\d|\:)\d\d _* /o))) {
	    # Roughly that regexp is time (13:45) or year ( 1990)
	    s/ (_*)$/_$1/o;
	    substr($_,$filenamebegin-6) =~
		s/^(..... )(_*) /$1$2_/o;
	    $spacepos=rindex($_," ");
	}
#Put in leading spaces for bogus stuff.
#Ditto for the : terminated stuff.
# Also fixup anthing like foo/bar /doobie:
	if (/:$/o) {
	    s/ (_*(:$|\/))/_$1/o while (/ _*(:$|\/)/o);
	    s/\/(_*) /\/_$1/o while (/\/_* /o);
	}

#Complicated fixups.
	if (/Permission denied/o||/not found/o||/cannot access/o||
	    /Connection timed out/o) {
	    $_ = &fixlserrors($_);
	    next if (! $_);
	}
    } until ($orig eq $_);

    if (/^$dirperms/o) { # && length($_)<$maxdiraddsize) {
	$dirname=$curdir . substr($_,$filenamebegin);
	if (length($dirname) <$maxdiraddsize) {
	    push(@basedirs,$dirname);
#	    print STDERR "Adding '",$dirname,"' to dir list";
	}
    }
    $curdir = substr($_,0,length($_)-1)."/" if (/:$/o);

    $lastline = $_;
    print;
    $printedaline=1;
}		      
	
sub fixlserrors {
    local ($_) = @_;
    local ($first);
    local ($count);

#    print STDERR "Enter FixLsErrors";
#    return "" if /^ls.*denied$/o;
    return "" if /^(\/bin\/)?ls.*denied$/o;
#    return "" if /^ls.*not found$/o;
    return "" if /^(\/bin\/)?ls.*not found$/o;
    return "" if /^cannot access /o;
    return "" if /^lost\+found: Permission denied$/o;
#    $foodebug=1 if $_ =~ /tesol: Per/;
    $first = &fixpermline($_);
#    print STDERR "*1$first" if $foodebug;
    $count = 0;
    $_ = undef;
    while (!$_) {
#	print STDERR "Hi";
	++$count;
	die ("@@@ FixLsErrors iterated too long :$count\n") if $count>200;
	$_ = <STDIN>;
	last if !defined($_);
#	print STDERR "*a$count,$first,$_"; #***
#	print STDERR "'$_'" if defined $_;
	chop ;
	last if /^$/o;
#	print STDERR "*b$count,$first,$_"; #***
        if (/denied$/o) {
	    $_ = "";
	} else {
	    $_ = &fixpermline($_);
	}
#	print STDERR "*c$count,$first,$_"; #***
    }
#    print STDERR "*d$first" if $foodebug;
#    print STDERR "@*$first,$_" if /pleD/o;
#    print STDERR "*e$first" if $foodebug;
    $_= $first . $_;
#    print STDERR "*&&$_" if $foodebug;
    return $_ if /^$fulllegit/o;
    return $_ if /^.*:$/o;
    return "";
}
    
sub fixpermline {
    local ($_) = @_;
    local ($count);

#    print "@$_@";
# sys13 stuff for potemkin.cs.pdx.edu
    s/(\/bin\/)?ls\s*:.*denied( \(sys13\))?$//o;
    s/(\/bin\/)?ls\s*:.*not found$//o;
    s/\.\/.*not found$//o;
    s/cannot access .*$//o;
    s/\.\/.*Connection timed out:$//o;
#    print STDERR "*$_*";
    return $_ if !(/Permission denied/o || /not found/o || /cannot access/o ||
		   /Connection timed out/o);

    $rightmost=0;
    $longlen=0;
    foreach $elem (@basedirs) {
#	if ($foodebug&&rindex($_,$elem)>=0) {
#	    print STDERR "@$elem";
#	    print STDERR "@$elem,", rindex($_,$elem);
#	    print STDERR "@", rindex($_,"mac/incoming");
#	}
	if (0<=($foo=rindex($_,$elem))) {
	    $bar = length($elem);
#	    print STDERR "@@$foo,$rightmost,$bar,$longlen,", $rightmost-$bar-1;
	    if ($foo>=($rightmost-$bar-1)) {
		# Backup by at most by the length of the current one.
		# Plus a /
		$longlen=$bar;
		$rightmost=$foo;
#		print STDERR "!$_,$elem,$longlen,$rightmost" if /incoming\/pal/;
	    }
	}
    }
#    if ($rightmost>0&&$foodebug) {
#	print STDERR "Found:", substr($_,$rightmost);
#	print STDERR "Returning:" , substr($_,0,$rightmost);
#    }
    if (/$fulllegit/o) {
	$_ = substr($_,0,$rightmost);
    } else {
	return substr($_,0,$rightmost) if (($rightmost>0) && 
					  !(substr($_,$rightmost-1,1) eq "/"));
#	print STDERR "nope";
    }

    local($acc,$orig) = ("",$_);
    local(@line) = split(//o,$_);
    local($x,$m);
    local(@legitcopy) = @legit;

    $count=0;
    do {
	++$count;
	die ("@@@ fixpermline iterated too long:$count\n") if $count>50;
	$_ = shift @line;
	$m = shift @legitcopy;
#	print STDERR "#$m#$_#$acc";
	return $acc if !/$m/;	# Don't put the o here, this changes.
#	print STDERR "##$acc";
	$acc .= $_;
    } until $#legitcopy==-1;
    $_=$orig;
    s/\:\s*Permission denied//o;
    s/\/(\w|\/)*\s*not found//o;
    return $_;
}
