Re: [Corpora-List] Extracting only editorial content from a HTML page

From: Vlado Keselj (vlado@cs.dal.ca)
Date: Thu Aug 11 2005 - 00:11:32 MET DST

    On Wed, 10 Aug 2005, Vlado Keselj wrote:

    >
    > This is becoming a *really* long thread, but still I am tempted to add
    > my $.02.
    >
    > I use a Perl script which grabs a web page, does some pre-processing,
    > reports new pieces using the diff command, and does some post-processing.
    > The algorithm is as follows:
    > 1. get the web page (for this one can use wget, lynx, or some other way)
    > 2. pre-processing (usually one wants to remove tags, but not necessarily;
    > e.g. lynx -dump, Tidy, or clean_html)
    > 3. if there is a previous version of the page then
    > 4. |  diff it against the old version, capturing the new stuff
    > 5. save this page as the old version
    > 6. if there was a diff, then the web page is only the new stuff
    > 7. post-processing
    >
    > Step 2 may become very interesting. Diff is very good, but it still
    > depends on physical lines, which are not always broken in an ideal way,
    > so you may want to "reshape" them in step 2 (a sketch of one such pass
    > follows the quoted message).
    >
    > If a page changes dramatically, one gets a burst of noise, but the
    > "extractor" then self-stabilizes just wonderfully. I use it as a
    > page-watch: run it as a cron job and mail any diffs (see the crontab
    > sketch after the script).
    >
    > If anybody is interested I can send/post my Perl script (after some
    > clean-up).
    >
    > --Vlado
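
    As an illustration of the "reshape" idea in step 2 above, one possible
    pre-processing pass (hypothetical, not part of the script below) collapses
    whitespace and breaks the lynx dump into one sentence per line, so that
    diff compares sentence-sized units rather than arbitrary physical lines:

    # Hypothetical reshaping pass for step 2: put one sentence per physical
    # line so that diff works on sentence-sized units.
    sub reshape_sentences {
        my $text = shift;
        $text =~ s/\s+/ /g;              # collapse all whitespace runs
        $text =~ s/([.!?])\s+/$1\n/g;    # newline after sentence-ending punctuation
        return $text . "\n";
    }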

    Appended. --Vlado

    #!/usr/bin/perl
    # www.cs.dal.ca/~vlado/srcperl/report-new.pl
    # (c) 2000-2005 Vlado Keselj

    sub help { print <<"#EOT" }
    # Report new material on a web page, version $VERSION
    #
    # Uses diff, lynx, sendmail (if option -e is used)
    #
    # Usage: report-new.pl [switches] URL
    # -h Print help and exit.
    # -v Print version of the program and exit.
    # -e email Sends output, if not empty, to email.
    #EOT

    use strict;
    use POSIX qw(strftime);

    use vars qw( $VERSION );
    $VERSION = sprintf "%d.%d", q$Revision: 1.1 $ =~ /(\d+)/g;

    use Getopt::Std;
    use vars qw($opt_v $opt_h $opt_e);
    getopts("hve:");

    if ($opt_v) { print "$VERSION\n"; exit; }
    elsif ($opt_h || !@ARGV) { &help(); exit; }

    ($#ARGV==0 && $ARGV[0]=~/^http:\/\//) ||
        die "Format: report-new.pl http://...\n";

    my ($urlbase, $url);
    $urlbase = $url = shift; # E.g.: http://www.cs.dal.ca/~vlado/srcperl
    if ( $url =~ m.//[^/]*/. )
    { $urlbase = $`.$& } # E.g.: http://www.cs.dal.ca/

    my $urlId = &encode_w1($url);
    my $timestamp = strftime("%Y-%m-%d-%T", localtime(time));

    if (! -d 'tmp')
    { mkdir 'tmp', 0700 or die "can't mkdir tmp: $!" }
    if (! -d 'report-new.pl.d')
    { mkdir 'report-new.pl.d', 0700 or die "can't mkdir report-new.pl.d: $!" }

    my $TmpBase = "tmp/$urlId-$timestamp";
    my $TmpFile1 = "$TmpBase-1";
    my $lastFile = "report-new.pl.d/$urlId.last";
    -e $lastFile or putfile($lastFile,'');

    # First step: grab the page
    $url =~ s/'/'"'"'/g;  # escape single quotes for the shell command below
    my $material = `lynx -dump -nolist '$url'`;
    putfile($TmpFile1, $material);

    # Second step: diff against the previous version; keep only the '<' lines
    # (material in the new fetch but not in the old one), then strip the
    # '< ' prefix that diff adds.
    $material = `diff $TmpFile1 $lastFile 2>&1`;
    $material =~ s/^[^<].*\n//mg;   # drop everything except the '<' lines
    $material =~ s/^< //mg;         # remove the diff markers

    if ($material) {
        if ($opt_e) {
            my $out;
            open($out, "|sendmail -t") or die;
            print $out "To: $opt_e\n".
                 "Subject: [report-new.pl] $url\n\n$material";
            close($out);
        }
        else { print $material }
    }
    # The freshly fetched page becomes the baseline for the next run.
    unlink($lastFile);
    rename($TmpFile1, $lastFile);

    sub putfile($@) {
        my $f = shift;
        local *F;
        open(F, ">$f") or die "putfile:cannot open $f:$!";
        print F '' unless @_;
        while (@_) { print F shift(@_) }
        close(F);
    }

    sub encode_w1 {
        local $_ = shift;
        s/[\W_]/'_'.uc unpack("H2",$&)/ge;
        return $_;
    }

    ## END
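
    For reference, the page-watch usage described in the message above (a cron
    job that mails any diffs) might look like the following crontab entry; the
    schedule, directory, and address are hypothetical. The cd matters because
    the script keeps its state in tmp/ and report-new.pl.d/ relative to the
    current directory:

    # hypothetical crontab entry: check the page every morning at 6:15
    15 6 * * * cd $HOME/page-watch && perl report-new.pl -e you@example.com 'http://www.example.com/news.html'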


