/*

       This program produces web site statistics from log file records.

 

       Various reports are produced as a function of criteria supplied by the user.

 

       Depending on the time period specified by the user, one or more log files

       must be accessed.  There is always a current "access" log file and possibly

       one or more archive log files.  Archive files are gzipped and named to

       reflect the date.

 

       To speed processing, as little time as possible is spent reading log records.

       Fields are read into work areas and later distributed and processed as needed.

 

       Arrays are built driven by the needs of the various reports.  If an array fills

       up, it is enlarged using "realloc."  Arrays are sorted using quicksort after

       they are finished being populated.

 

       Report 1: Top n clients accessing the site.  Shows a count and the domain name.

              Sorted by count, highest first.

 

       Report 2: Top n files accessed.  Shows count and file (e.g. a gif or an HTML page).

              Sorted by count, highest first.

 

       Report 3: Files containing the string "whatever."  Shows count and client (IP).

              Sorted by count, highest first.

 

       Report 4: Period totals for site.  Shows individual clients, all clients, hits,

              page hits, KB transmitted.  Individual clients = unique IP's; all clients

              counts an IP as many times as days it occurs, and = the sum of the "clients"

              column in report 5; hits is sum of "hits" column in report 5; page hits =

              sum of "page hits" column in report 5; KB transmitted = sum of "KB" column

              in report 5.

 

       Report 5: Daily Totals for site.  Shows date, clients, hits, page hits, KB transmitted.

              Date is in form: weekday (3 char), e.g. Mon; month (3 char), day (no leading zero),

              year (4 digits).  KB is to two decimal places.

 

       Report 6: Daily Averages.  Shows same columns as report 5.  Values = report 4 values

              divided by number of days in period.  Clients here is based on "all clients"

              from report 4, rather than "individual clients."

 

       Report 7: Hourly averages.  Shows hour, hits, percentage, and KB transmitted.

              Hour appears as e.g. "Midnight to 1 am" or "1 am to 2 am."  Percentage is

              nn.n %.

 

       Report 8: Summary of HTTP errors.  Shows count, error code (e.g. 404), text (e.g. Not Found).

              Sorted by count, highest first.

 

       Report 9: (Top 20) requests causing errors.  Shows count and request (file).

              Sorted by count, highest first.

 

       Design notes:

 

              Since this isn't Perl, and none of us can figure out how to redirect standard output

              from a shell command back to this C program's standard in, the output of the cat or

              gzcat is to a work file.

 

              For reports 4-9 we read in the log records one-by-one.  But we only read each

              work file once, extracting all of the data for all of those reports in one pass.

 

              The processing for reports 1-3 is affected by the fact that there can be from a handful

              to millions of log records to be processed for a given user submission of the stats

              engine.  This precludes building an array in-core to hold information for each ip

              encountered, e.g., without first having determined how many occur.  And even if we

              built an array based on that count of unique ip's, if we read each log record we have

              to search the array each time to find the right entry to increment the count for.

              So, a hash table is a better data structure to use.

 

              For these reports, we extract selected log fields from each relevant log file

              (based on the user-supplied date range) using shell commands (via system calls)

              and concatenate this output to a work file.  This allows us to determine how many

              hash table entries to allocate for the primary array.  We also can perform a number

              of useful operations on the combined log data.

 

              For the purposes of report 1 we "cut" (extract) just the first field, which is the ip,

              from each log record.*  After cutting from all relevant files, this gives us a file of

              all occurrences of ip's.  We sort this and pipe to "uniq" which gives us just the unique

              ip's that occurred.  A "wc" of that tells us how many ip's to take into account when

              allocating the primary hash table structure.

 

              We can now populate the hash table.  Reading in a record from the file with all ip

              occurrences, we directly hash to the right entry and increment its count.  In case of

              the entry already being populated by a different ip (a synonym situation), we chain a

              linked list off the main entry so that each different ip mapping to that same primary

              entry gets its own entry.

 

              When we're done accumulating occurrences, we can produce report 1.

 

              For report 2 we employ a similar strategy to that used for report 1.  We extract the

              "file" field instead of the ip field.  A hash table is eventually built based on the

              file combining the "file" fields from all relevant log files.

 

              For report 3 we read the log records as in reports 4-9, but proceed more like we do for

              reports 1 and 2.  We look for records whose "file" field contains the user-supplied string.

              For those that do, we write a record to a file with just the ip.  After all log files

              have been read, we process the resulting ip file.  We sort it and uniq it to get the

              number of unique ip's.  We read this in and allocate an array with that many entries.

              We read the sorted ip file and accumulate the count of occurrences for each ip there.

              We then write the ip's and counts to a file and sort it by count, descending.  We read

              that in and produce the report.

 

              *Since for report 5 we need ip information within date, we build a work file with both

              ip and date.  We use this later for report 1, cutting the ip column to create the ip file.

 

*/

 

#include "stdio.h"

#include "string.h"

#include "time.h"

#include "math.h"

#include "stdlib.h"

#include "unistd.h"       /* For getpid, execle commands */

#include <sys/types.h>

#include <sys/socket.h>

#include <netinet/in.h>

#include <arpa/inet.h>

#include <netdb.h>

#include <ctype.h>

 

/* Multiplier applied to the number of unique ip's when malloc'ing the hash table. */
/* NOTE(review): presumably oversizes the table to reduce collisions -- confirm at the allocation site. */

#define scaling_factor 3

 

/*
 * Work areas for the fields of one log record.  (The file header notes that
 * fields are read into work areas and later distributed and processed.)
 * Note the sizes: ip is 8192 bytes, every other text field is 256 bytes.
 */
char ip[8192];            /* Host   (IP address)       */

char ident[256];       /* Ident field          */

char authuser[256];      /* Authuser             */

char timestamp1[256];    /* Time Stamp part 1 */

char timestamp2[256];    /* Time Stamp part 2 */

char file1[256];       /* HTTP Request       part 1       */

char file2[256];       /* HTTP Request       part 2       */

char file3[256];       /* HTTP Request       part 3       */

char status[256];       /* Status Code          */

long bytes;          /* Transfer Volume */

 

/*
 * General-purpose file-scope work variables, declared in roughly
 * alphabetical order.  Remarks marked "presumably" are inferred from names
 * only and should be confirmed at the use sites.
 */
char *blank_date = "           ";     /* 11 blanks */

char  blank_ip[16];

char  blank_string[256];

char *buff;

char  buffer[1024];

char *buff2;

/* presumably growth increments used when arrays are enlarged with realloc -- TODO confirm */
long  bump1 = 1000;

long  bump2 = 100;

long  bump3 = 10;

char  cat_string[256];

/* Directory prefix, currently "/tmp/".  The array holds only 10 bytes, so a */
/* longer path such as the commented-out one below needs the larger size.    */
char  cgipath[10] = "/tmp/";

/*char  cgipath[44] = "/Space/Domains/stats.simplenet.net/cgi-bin/";*/

char  char_oldest_day[4];

char  char_oldest_year[5];

char  char_youngest_day[4];

char  char_youngest_year[5];

int   code_red_found = 0;

char  curr_file[100];

long  curr_s1 = 0;

long  curr_s2 = 0;

long  curr_s3 = 0;

/* Separator line written as-is: newline, three tabs, dashes, two newlines. */
char *dashed_line = "\n\t\t\t---------------------------\n\n";

/* Date buffers are 12 bytes: room for an 11-character date (see blank_date) plus NUL. */
char  date[12];

long  date_bias;

char  date_save[12];

char  domain[256];

char  end_date[12];

char  end_day[3];

char  end_month[3];

char  end_year[5];

long  endloop;

char  enterprise_string[64];

/* NOTE(review): these look like pieces of an Enterprise-server log "format=" header line -- confirm. */
char *enterprise_string1 = "format=%Ses->client.ip%";

char *enterprise_string2 = "%Req->srvhdrs.content-length%";

long  entire_period = 0;

/*
 * HTTP error tallies: for each tracked 4xx/5xx status code, a counter
 * (starting at 0) and the text shown next to the code.
 *
 * Every literal is padded with blanks to exactly 21 characters (the length
 * of "Internal server error") so the text column has a fixed width.
 *
 * The texts for 402, 405, 406 and 504 previously duplicated a neighbouring
 * code's text (e.g. 405 was labelled "Not found"); they now carry the
 * standard reason phrase for their own code.
 */
long  e400_count = 0;
char *e400_lit = "Syntax error         ";

long  e401_count = 0;
char *e401_lit = "Unauthorized         ";

long  e402_count = 0;
char *e402_lit = "Payment required     ";

long  e403_count = 0;
char *e403_lit = "Forbidden            ";

long  e404_count = 0;
char *e404_lit = "Not found            ";

long  e405_count = 0;
char *e405_lit = "Method not allowed   ";

long  e406_count = 0;
char *e406_lit = "Not acceptable       ";

long  e410_count = 0;
char *e410_lit = "No longer available  ";

long  e500_count = 0;
char *e500_lit = "Internal server error";

long  e501_count = 0;
char *e501_lit = "Not implemented      ";

long  e502_count = 0;
char *e502_lit = "Bad gateway          ";

long  e503_count = 0;
char *e503_lit = "Service unavailable  ";

long  e504_count = 0;
char *e504_lit = "Gateway timeout      ";

/*
 * More file-scope work variables.  The *_string[256] arrays initialised with
 * a stem ending in '_' are work-file name stems; presumably a per-run suffix
 * is appended to each elsewhere -- confirm in the filename-preparation code.
 */
char  error_code_file_string[256] = "error_code_file_";

char  file2_buffer[8192];

/* presumably flags recording whether fptr10/11/12/15/16 are currently open -- TODO confirm */
int   file10_open = 0;

int   file11_open = 0;

int   file12_open = 0;

int   file15_open = 0;

int   file16_open = 0;

long  file_date_status = 0;

char  file_list_string[256] = "file_list_";

long  fqdn_limit;

char  header_work[256];

char  header_work2[256];

long  indexx;

char *input_parm_file;

/* presumably the numeric form of the user-supplied start/end dates -- TODO confirm */
long  int_start_year;

long  int_start_month;

long  int_start_day;

long  int_end_year;

long  int_end_month;

long  int_end_day;

long  ip_date_file_removed = 0;

char  ip_date_file_string[256] = "ip_date_file_";

char  ip_save[256];

int   null_found = 0;

long  num_clients;

long  num_dates;

long  num_dates_used = 0;

long  num_uniq_items;

long  oldest_year;

long  oldest_month;

long  oldest_day;

char  outfile_string[256] = "outfile_";

char  parm_line[256];

/* Starts out as an empty string. */
char  pid[256] = "\0";

char  plural[2];

/* Starts out as a single blank. */
char  random_suffix_char[2] = " ";

int   rc = 0;

/* rec_*: presumably the date/time pieces of the log record being processed -- TODO confirm */
char  rec_date[12];

char  rec_day[3];

int   rec_day_int;

long  rec_hour;

char  rec_month[4];

int   rec_month_int;

long  rec_too_new;

char  rec_year[3];

int   rec_year_int;

char  rec_year_long[5];

long  record_count = 0;

/*
 * Report-related globals and remaining work variables.  The *_string[256]
 * arrays initialised with a stem ending in '_' are work-file name stems;
 * presumably a per-run suffix is appended elsewhere -- TODO confirm.
 */
char  report1_ip_file_string[256] = "report1_ip_file_";

char  report2_file_file_string[256] = "report2_file_file_";

char  report3_ip_file_string[256] = "report3_ip_file_";

char  report9_file_file_string[256] = "report9_file_file_";

char  report_file_string[256] = "report_file_";

long  report_index;

char  report_input_file[256];

long  report_limit;

char  report_lit[256];

long  report_num;

char  report_work[256];

/*
 * rN_limit / rN_lit: initial row limit and heading format for report N.
 * The headings are printf-style formats: r1_lit and r2_lit take a long
 * followed by a string, r3_lit takes one string, r9_lit takes one long.
 */
char  r1_exclude_string[16];

long  r1_limit = 30;

char *r1_lit = "Top %ld Clients Accessing %s:\n\n";

char  r2_exclude_string[256];

long  r2_limit = 40;                  /* User supplied */

char *r2_lit = "Top %ld Files Accessed on %s:\n\n";

long  r3_limit = 40;

char *r3_lit = "Clients accessing files containing the string \"%s\" in this period:\n\n";

char  r3_string[256];

int   r5sorted_ip_date_file_created = 0;

int   r5sorted_ip_date_file_removed = 0;

char  r5sorted_ip_date_file_string[256] = "r5sorted_ip_date_file_";

long  r5_data_needed = 0;

long  r9_limit = 20;

char *r9_lit = "Top %ld Requests Causing Errors:\n\n";

char  sorted_error_code_file_string[256] = "sorted_error_code_file_";

char  sorted_report_file_string[256] = "sorted_report_file_";

char  sort_string[256];

char  start_date[12];

char  start_day[3];

char  start_month[3];

char  start_year[5];

char  std_error_file_string[256] = "std_error_file_";

char  std_error_file_string2[256] = "std_error_file2_";

/* tot_*: presumably the period totals described for report 4 in the file header -- TODO confirm */
long  tot_clients;

long  tot_hits;

/* NOTE(review): single-precision float; very large KB totals may lose precision. */
float tot_kb;

long  tot_page_hits;

char  uniq_ip_count_string[256] = "uniq_ip_count_";

char  uniq_ip_list_string[256] = "uniq_ip_list_";

char  uniq_item_count_string[256] = "uniq_item_count_";

char  uniq_item_list_string[256] = "uniq_item_list_";

char *val;

char  work_bytes[256];

long  work_count;

char  work_date[12];

char  work_day[4];

char  work_daynum[3];

char  work_file_string[256] = "work_file_";

char  work_mon[4];

/* Scratch string buffers. */
char  work_string1[256];

char  work_string2[256];

char  work_string3[256];

char  work_string4[256];

char  work_string5[256];

char  work_year[5];

long  y;

long  youngest_year;

long  youngest_month;

long  youngest_day;

 

/*
 * A counted string with a link to another entry of the same type.
 * Both report_hash and report_array point at entries of this type.  The
 * file header describes the hash-table use: an item's count is incremented
 * on each occurrence, and items that collide on the same primary entry are
 * chained off it as a linked list.
 */
typedef struct struc_1

{

       char   s1_string[256];   /* the item being counted; presumably an ip or file name -- TODO confirm */

       long   s1_count;         /* count for s1_string */

       struct struc_1 *s1_next_hash_entry;   /* link to another struc_1; presumably the collision chain -- confirm */

}d1;

 

/* Pointer to struc_1 entries; going by the name, the hash table used for the reports. */
struct struc_1 *report_hash;

struct struc_1 *report_array;      /* Really an array, not used as a hash table.  See create_report1_file(). */

 

/*
 * Totals for one date, plus a link to another entry of the same type.
 * The fields match the columns the file header lists for report 5 (date,
 * clients, hits, page hits, KB transmitted).  report5_hash points at
 * entries of this type.
 */
typedef struct struc_3

{

       char   s3_date[12];      /* date text; 12 bytes */

       long   s3_clients;       /* clients for this date */

       long   s3_hits;          /* hits for this date */

       long   s3_page_hits;     /* page hits for this date */

       float  s3_kbytes;        /* KB transmitted for this date */

       struct struc_3 *s3_next_hash_entry;   /* link to another struc_3; presumably the collision chain -- confirm */

}d3;

 

/* Pointer to struc_3 (per-date totals) entries; going by the name, the hash table for report 5. */
struct struc_3 *report5_hash;

 

/*
 * One row of time-slot data: a label, a hit count and a KB figure.
 * report7_array points at entries of this type; the file header describes
 * report 7 as hourly figures showing hour, hits, percentage and KB.
 */
typedef struct struc_4

{

       char  s4_time[21];   /* time label; presumably text such as "Midnight to 1 am" -- confirm */

       long  s4_hits;       /* hits for this time slot */

       float s4_kbytes;     /* KB transmitted for this time slot */

}d4;

 

/* Pointer to struc_4 entries for report 7; presumably one per hour of the day -- TODO confirm. */
struct struc_4 *report7_array;

 

/* Calendar-time work variables: a broken-down time pointer and a time_t value. */
/* NOTE(review): presumably filled by time()/localtime() in get_time() -- confirm. */
struct tm *time_ptr;

time_t lt;

 

/*
 * Stream handles for the log, work and report files, one per role.
 * "i/p" and "o/p" in the notes below mean input and output.
 */
FILE *fptr1;       /* log file list            */

FILE *fptr2;       /* cat'd log file          */

FILE *fptr3;       /* stats report file          */

FILE *fptr4;       /* count of unique ip's          */

FILE *fptr5;       /* ip from each log rec    */

FILE *fptr6;       /* report initial data as i/p */

FILE *fptr7;       /* count of report ip's or files*/

FILE *fptr8;       /* report data to sort         */

FILE *fptr9;       /* sorted report data          */

FILE *fptr10;       /* ip_date_file                */

FILE *fptr11;       /* report1_ip_file            */

FILE *fptr12;       /* report2_file_file          */

FILE *fptr13;       /* error code file          */

FILE *fptr14;       /* sorted error code file       */

FILE *fptr15;       /* report3 initial data as o/p */

FILE *fptr16;       /* report9 initial data as o/p */

FILE *fptr17;       /* input parameter file          */

 

/*
 * Forward declarations, in alphabetical order.
 *
 * All but prepare_filename_string() use an empty parameter list "()".  In C
 * before C23 that declares a function with unspecified parameters, so the
 * compiler does not check argument counts or types at call sites.  In C23
 * "()" means "(void)".
 * NOTE(review): if any of these functions take arguments, give them full
 * prototypes before building as C23 -- confirm against the definitions.
 */
long  check_date_range();

void  check_file_date();

long  check_for_match();

long  check_for_wildcards();

void  close_some_files();

int   create_record_stream();

void  create_report();

void  create_report_file();

char *dayname();

void  do_reports();

char *error_lit();

void  estimate_num_dates_entire();

void  estimate_num_dates_specific();

void  get_count();

void  get_dates();

void  get_domain_name();

void  get_form_input();

void  get_ip_count();

void  get_random_suffix();

void  get_time();

char *get_value();

long  hash_date();

long  hash_file();

long  hash_ip();

void  init();

void  init_report5_hash();

void  init_report7_array();

void  init_report_array();

void  init_report_hash();

char *month_name();

long  month_num();

void  omit_header();

void  open_report_input_file();

void  open_some_files();

void  output_report5();

long  page_hit();

void  parse_input();

void  populate_report_hash();

char *prepare_filename_string(char *);

void  prepare_filename_strings();

void  prepare_filename_suffix();

void  prepare_file_list();

void  prepare_parm_input();

void  print_glossary();

void  print_header();

void  print_report_hash();

void  print_sorry();

void  print_time();

void  report();

void  report_setup();

void  reports1and4_setup();

void  report1_setup();

void  report2_setup();

void  report3_ongoing();

void  report3_setup();

void  report4();

void  report5();

void  report5_init();

void  report5_ongoing();

void  report6();