#!/usr/local/bin/perl

# This software is Copyright (c) 1995 Jeff Weisberg <jaw@op.net>
# Permission is granted to use, copy and distribute this software
# under the following conditions:
#     -   This license covers the original software, as well as
#         modified or derived works.
#     -   All modified or derived works must contain this notice
#         unmodified and in its entirety.
#     -   This software is not to be used for any purpose which
#         may be considered illegal, immoral, or unethical.
#     -   This software is provided as is and without warranty.


# this program generates stats on usenet article count and volume
# generates html (oooh, aaah!)

# log files to analyze are generated from inn, using a
# newsfeed line similar to:
# STATS!:*:Tf,WtbNs:/var/adm/news-stats

# here at OpNet, we rotate the stats log nightly (in news.daily)
# and keep the past 10 days files online (gzipped), and
# analyze them (roughly) weekly


# ---- tunables and running totals ------------------------------------
# Byte totals are carried as a pair of numbers: a low part that is kept
# below $k, plus a count of how many times $k has been subtracted.
# Caution: $a is also the name sort uses for its comparison variable
# (the sort blocks further down use $a/$b), so take care when touching it.
($k, $a)     = (100_000_000, 1_000);	# bignum base, scale factor (~1k)
($t, $tsize) = (0, 0);			# overflow count and low part of total size

# how many rows each "Top N" table gets
$nshow = 25;

# where the generated html page is written
$outfile = "/tmp/usenet-stats.html";


# Main tally loop: read one log record per input line and accumulate
# article counts and byte volumes per feeding site, per newsgroup, per
# hierarchy, per calendar day and per hour of day.
#
# Each record is split into four whitespace-separated fields:
#     time  size  comma-separated-newsgroups  feeding-site
# Records that cannot be used (missing fields, non-numeric time or
# size, no non-empty newsgroup name) are skipped with a warning rather
# than aborting the whole run with a division by zero.
#
# Byte totals are kept as (low part, overflow count) pairs: whenever a
# low part exceeds $k, $k is subtracted and the matching overflow
# counter is bumped.
while( <> ){
    # split(' ') ignores leading whitespace and treats a run of
    # whitespace as a single separator, so stray blanks cannot shift
    # the fields around
    my ($time, $size, $groups, $from) = split(' ');

    # drop empty names produced by things like "a,,b" or a lone ","
    my @groups = grep { length($_) }
                 split(/,/, defined $groups ? $groups : '');

    unless( @groups && $time =~ /^\d+$/ && $size =~ /^\d+$/ ){
        warn "skipping malformed record at input line $.\n";
        next;
    }

    # progress report every 10000 articles
    print "C:$tcount S:$t:$tsize\n" unless( ($tcount % 10000));

    # track the earliest and latest timestamps seen
    $start = $time if !$start || $time < $start;
    $end   = $time if !$end   || $time > $end;
    $tsize += $size;
    $tcount ++;

    # tally where from
    $ourfeeds{ $from } ++;

    # carry the grand total into its overflow counter
    if( $tsize > $k ){
        $t ++;
        $tsize -= $k;
    }

    # a crossposted article has its size shared evenly between groups
    my $es = $size / @groups;
    foreach my $g ( @groups ){
        $size_by_group{ $g } += $es;
        $count_by_group{ $g } ++;
        if( $size_by_group{ $g } > $k ){
            $size_by_group{ $g } -= $k;
            $size_by_group_ov{ $g } ++;
        }

        # people put some really odd things in newsgroup headers...
        # collapse runs of dots and strip a leading/trailing dot
        my $h = $g;
        $h =~ s/\.+/\./g;
        $h =~ s/^\.//;
        $h =~ s/\.$//;

        # credit the group itself and every enclosing hierarchy:
        # a.b.c, then a.b, then a.  Testing length (not truth) keeps a
        # component that is literally "0" from ending the walk early.
        while( length $h ){
            $size_by_hier{ $h } += $es;
            $count_by_hier{ $h } ++;
            if( $size_by_hier{ $h } > $k ){
                $size_by_hier{ $h } -= $k;
                $size_by_hier_ov{ $h } ++;
            }
            # chop the last component; bail out if nothing was removed
            # so the loop can never spin
            last unless $h =~ s/\.?[^.]+\.?$//;
        }
    }

    # tally by when (local time zone)
    my ($hrs, $dom, $mon, $yr) = (localtime($time))[2 .. 5];
    my $day = sprintf("%.4d/%.2d/%.2d", $yr+1900, $mon+1, $dom);
    print STDERR "$day, $hrs\n" unless( ($tcount % 10000));

    $count_by_day{ $day } ++;
    $size_by_day{ $day } += $size;
    if( $size_by_day{ $day } > $k ){
        $size_by_day{ $day } -= $k;
        $size_by_day_ov{ $day } ++;
    }

    $count_by_hrs[ $hrs ] ++;
    $size_by_hrs[ $hrs ] += $size;
    if( $size_by_hrs[ $hrs ] > $k ){
        $size_by_hrs[ $hrs ] -= $k;
        $size_by_hrs_ov[ $hrs ] ++;
    }
}

# Fold every (low part, overflow count) pair back into one number.
# Everything is divided by $a on the way, so from here on the sizes are
# in units of roughly 1k bytes.
print "Adjusting...\n";

my $per_carry = $k / $a;		# what one overflow step is worth after scaling

$tsize = $tsize / $a + $t * $per_carry;

print "C:$tcount S:$tsize\n";

# the hash-based tallies all get the same treatment
foreach my $pair ( [ \%size_by_group, \%size_by_group_ov ],
                   [ \%size_by_hier,  \%size_by_hier_ov  ],
                   [ \%size_by_day,   \%size_by_day_ov   ] ){
    my ($low, $carry) = @$pair;
    foreach my $key ( keys %$low ){
        $low->{$key} = $low->{$key} / $a + $carry->{$key} * $per_carry;
    }
}

# the hourly tally is a fixed 0..23 array; an hour that never saw an
# article comes out as 0
foreach my $hour ( 0 .. 23 ){
    $size_by_hrs[$hour] = $size_by_hrs[$hour] / $a
                        + $size_by_hrs_ov[$hour] * $per_carry;
}

# Work out the factor that turns totals for the whole logged interval
# into per-day averages.
#   $dt    - seconds between the earliest and the latest article seen
#   $adj   - multiply an interval total by this to get a daily average
#   $ndays - length of the interval in days, formatted for the report
die "no usable article records were read; nothing to report\n"
    unless $tcount;

$dt = $end - $start;
# a single article (or several sharing one timestamp) gives a zero-length
# interval; clamp it to one second so the division below is defined
$dt = 1 if $dt < 1;
$adj = (3600 * 24)/$dt;
$ndays = sprintf("%.2f", 1/$adj);
    
print "Summarizing...\n";

# Plain-text report of where the articles came from, busiest site first.
# write uses the FEEDS format, which reads the globals $w (site),
# $p (percent of all articles) and $n (article count).
$~ = 'FEEDS';
my @feeds_by_count = sort { $ourfeeds{$b} <=> $ourfeeds{$a} } keys %ourfeeds;
foreach $w ( @feeds_by_count ){
    ($n, $p) = ( $ourfeeds{$w}, $ourfeeds{$w} * 100 / $tcount );
    write;
}


# spew forth html

# note: /img/dot_peri.gif is a 1pixel*1pixel gif
# 1pixel gif idea inspired by Mr. "Tekton" www.best.com/~dsiegel

# Open the report file and emit the page header.  The handle is left
# open under the name HTML for the table-writing code that follows.
# Failing to open the file is fatal: without it there is no report.
open( HTML, '>', $outfile )
    or die "cannot open $outfile for writing: $!\n";

# per-day averages over the whole interval: article count, and volume
# (the scaled total divided by 1024, shown as "M" on the page)
$avdc = sprintf("%.2f", $tcount * $adj);
$avds = sprintf("%.2f", $tsize  * $adj / 1024);

print HTML <<EOH;
<HTML><TITLE>Usenet Stats</TITLE><BODY>
<H2>Usenet Stats</H2>
The following summarizes the average daily Usenet traffic
passing through
<A HREF="http://www.op.net/">OpNet</A>
during the past $ndays days.

<UL>    
<LI><A HREF="#SBG">Usenet Average Daily Traffic Analysis: volume by group</A>
<LI><A HREF="#CBG">Usenet Average Daily Traffic Analysis: article count by group</A>
<LI><A HREF="#SBH">Usenet Average Daily Traffic Analysis: volume by hierarchy</A>
<LI><A HREF="#CBH">Usenet Average Daily Traffic Analysis: article count by hierarchy</A>
<LI><A HREF="#BYHR">Usenet Average Daily Traffic Analysis: by hour</A>
<LI><A HREF="#BYDAY">Usenet Traffic Analysis: by day</A>
</UL>    

<B>average daily article count:</B> $avdc<BR>
<B>average daily article volume:</B> ${avds}M<P>

EOH

# Table: the $nshow newsgroups with the largest volume.
# %size_by_group holds scaled totals for the whole interval; $adj turns
# them into daily averages, and the percentage is relative to $tsize.
$i = 0;
print HTML "<A NAME=\"SBG\">Usenet Average Daily Traffic Analysis: volume by group</A>\n";
print HTML "<TABLE BORDER>\n";
print HTML "<TR><TH>rank</TH><TH>Megabytes</TH><TH>percent</TH><TH>newsgroup</TH></TR>\n";

foreach $g ( sort({$size_by_group{$b} <=> $size_by_group{$a};}  keys %size_by_group)){
    last if $i++==$nshow;

    $ss = $size_by_group{$g};
    $s = sprintf( "%.2f", $ss / 1024 * $adj );
    $p = sprintf( "%.2f", $ss * 100.0 / $tsize);

    # group names come straight from article headers, so neutralize
    # anything a browser would treat as markup (& must be done first)
    my $name = $g;
    $name =~ s/&/&amp;/g;
    $name =~ s/</&lt;/g;
    $name =~ s/>/&gt;/g;
    $name =~ s/"/&quot;/g;

    # bar graph: a 1x1 gif stretched to 10 pixels per percent
    $graph = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=". int($p * 10) . ">";
    print HTML "<TR><TD>$i</TD><TD>$s</TD><TD>$p</TD><TD>$name</TD><TD>$graph</TD></TR>\n";
}
print HTML "</TABLE><P>\n";

# Table: the $nshow newsgroups with the most articles.
# Counts are totals for the whole interval; $adj turns them into daily
# averages, and the percentage is relative to $tcount.
$i = 0;
print HTML "<A NAME=\"CBG\">Usenet Average Daily Traffic Analysis: article count by group</A>\n";
print HTML "<TABLE BORDER>\n";
print HTML "<TR><TH>rank</TH><TH>articles</TH><TH>percent</TH><TH>newsgroup</TH></TR>\n";

foreach $g ( sort({$count_by_group{$b} <=> $count_by_group{$a};} keys %count_by_group)){
    last if $i++==$nshow;

    $s = sprintf( "%.2f", $count_by_group{$g} * $adj);
    $p = sprintf( "%.2f", $count_by_group{$g} * 100 / $tcount);

    # group names come straight from article headers, so neutralize
    # anything a browser would treat as markup (& must be done first)
    my $name = $g;
    $name =~ s/&/&amp;/g;
    $name =~ s/</&lt;/g;
    $name =~ s/>/&gt;/g;
    $name =~ s/"/&quot;/g;

    # bar graph: a 1x1 gif stretched to 100 pixels per percent
    $graph = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=". int($p * 100) . ">";
    print HTML "<TR><TD>$i</TD><TD>$s</TD><TD>$p</TD><TD>$name</TD><TD>$graph</TD></TR>\n";
}
print HTML "</TABLE><P>\n";

# Table: the $nshow hierarchies with the largest volume.
# %size_by_hier holds scaled totals for the whole interval; $adj turns
# them into daily averages, and the percentage is relative to $tsize.
$i = 0;
print HTML "<A NAME=\"SBH\">Usenet Average Daily Traffic Analysis: volume by hierarchy</A>\n";
print HTML "<TABLE BORDER>\n";
print HTML "<TR><TH>rank</TH><TH>Megabytes</TH><TH>percent</TH><TH>hierarchy</TH></TR>\n";
foreach $g ( sort({$size_by_hier{$b} <=> $size_by_hier{$a};} keys %size_by_hier)){
    last if $i++==$nshow;

    $ss = $size_by_hier{$g};
    $s = sprintf( "%.2f", $ss / 1024 * $adj);
    $p = sprintf( "%.2f", $ss * 100 / $tsize);

    # hierarchy names are derived from article headers, so neutralize
    # anything a browser would treat as markup (& must be done first)
    my $name = $g;
    $name =~ s/&/&amp;/g;
    $name =~ s/</&lt;/g;
    $name =~ s/>/&gt;/g;
    $name =~ s/"/&quot;/g;

    # bar graph: a 1x1 gif stretched to 2 pixels per percent
    $graph = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=". int($p * 2) . ">";
    print HTML "<TR><TD>$i</TD><TD>$s</TD><TD>$p</TD><TD>$name</TD><TD>$graph</TD></TR>\n";
}
print HTML "</TABLE><P>\n";

# Table: the $nshow hierarchies with the most articles.
# Counts are totals for the whole interval; $adj turns them into daily
# averages, and the percentage is relative to $tcount.
$i = 0;
print HTML "<A NAME=\"CBH\">Usenet Average Daily Traffic Analysis: article count by hierarchy</A>\n";
print HTML "<TABLE BORDER>\n";
print HTML "<TR><TH>rank</TH><TH>articles</TH><TH>percent</TH><TH>hierarchy</TH></TR>\n";
foreach $g ( sort({$count_by_hier{$b} <=> $count_by_hier{$a};} keys %count_by_hier)){
    last if $i++==$nshow;

    $s = sprintf( "%.2f", $count_by_hier{$g} * $adj);
    $p = sprintf( "%.2f", $count_by_hier{$g} * 100 / $tcount);

    # hierarchy names are derived from article headers, so neutralize
    # anything a browser would treat as markup (& must be done first)
    my $name = $g;
    $name =~ s/&/&amp;/g;
    $name =~ s/</&lt;/g;
    $name =~ s/>/&gt;/g;
    $name =~ s/"/&quot;/g;

    # bar graph: a 1x1 gif stretched to 2 pixels per percent
    $graph = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=". int($p * 2) . ">";
    print HTML "<TR><TD>$i</TD><TD>$s</TD><TD>$p</TD><TD>$name</TD><TD>$graph</TD></TR>\n";
}
print HTML "</TABLE><P>\n";

# Table: actual (not averaged) volume and article count for each
# calendar day seen in the logs, oldest first.  The two bar graphs show
# each day as a percentage of the daily averages $avds and $avdc.
print HTML "<A NAME=\"BYDAY\">Usenet Traffic Analysis: by day</A>\n";
print HTML "<TABLE BORDER>\n";
print HTML "<TR><TH>day</TH><TH>Megabytes</TH><TH>articles</TH><TH>size</TH><TH>count</TH></TR>\n";
foreach $day ( sort( keys %count_by_day )){

    $sa = $size_by_day{ $day };
    $sm = sprintf( "%.2f", $sa / 1024);
    $sb = $count_by_day{ $day };
    $sc = sprintf( "%.2f", $sb);

    # $avds and $avdc are rounded to two decimals, so with very little
    # input they can be 0.00; draw an empty bar instead of dividing by 0
    $pm = $avds > 0 ? sprintf( "%.2f", $sa / 1024 * 100 / $avds) : 0;
    $pc = $avdc > 0 ? sprintf( "%.2f", $sb * 100 / $avdc)        : 0;

    # bar graphs: a 1x1 gif stretched to 1 pixel per percent
    $graphm = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=". int($pm * 1) . ">";
    $graphc = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=". int($pc * 1) . ">";
    print HTML "<TR><TD>$day</TD><TD>$sm</TD><TD>$sc</TD><TD>$graphm</TD><TD>$graphc</TD></TR>\n";

}
print HTML "</TABLE><P>\n";

# Table: average daily volume and article count for each hour of the
# day (0..23).  The bars show each hour's share of the overall volume
# and article count, at 15 pixels per percent.
print HTML "<A NAME=\"BYHR\">Usenet Average Daily Traffic Analysis: by hour</A>\n",
           "<TABLE BORDER>\n",
           "<TR><TH>hour</TH><TH>Megabytes</TH><TH>articles</TH><TH>size</TH><TH>count</TH></TR>\n";

my $bar_prefix = "<IMG SRC=/img/dot_peri.gif HEIGHT=8 WIDTH=";

for my $hr ( 0 .. 23 ){
    my ($volume, $articles) = ( $size_by_hrs[$hr], $count_by_hrs[$hr] );

    # daily averages shown in the table
    my $sm = sprintf( "%.2f", $volume / 1024 * $adj );
    my $sc = sprintf( "%.2f", $articles * $adj );

    # shares of the totals, used only to size the bars
    my $pm = sprintf( "%.2f", $volume * 100 / $tsize );
    my $pc = sprintf( "%.2f", $articles * 100 / $tcount );

    my $graphm = join( '', $bar_prefix, int($pm * 15), ">" );
    my $graphc = join( '', $bar_prefix, int($pc * 15), ">" );

    print HTML "<TR><TD>$hr</TD><TD>$sm</TD><TD>$sc</TD><TD>$graphm</TD><TD>$graphc</TD></TR>\n";
}
print HTML "</TABLE><P>\n";


# Finish the page, then close the handle and check the result: output
# is buffered, so a failed write (full disk, for instance) may only be
# reported when the file is closed.
print HTML <<EOF;
<!--#include virtual="/footer.shtml" -->
</BODY>
</HTML>    
EOF

close( HTML )
    or die "error writing $outfile: $!\n";


# Output line used by write for the per-feed summary (selected via
# $~ = 'FEEDS').  It reads three globals set by the summary loop:
#   $w - feeding site, in a left-justified text field
#   $p - that site's percentage of all articles, two decimal places
#   $n - that site's article count, right-justified
format FEEDS =
@<<<<<<<<<<<<<<<<<<< @###.##  @>>>>>
$w,                  $p,      $n
.
    ;

