#include <algorithm>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <cstdio>
#include <cstdlib>
#include <ext/hash_map>
#include <fstream>
#include <iostream>
#include <pcrecpp.h>
#include <string>
#include <vector>

using namespace boost;
using namespace boost::algorithm;

// Pattern for the request paths that analyse_line() counts as hits: the whole
// string must be /ongoing/When/DDDx/DDDD/DD/DD/<name>, where D is a digit and
// <name> is one or more characters that are neither a space nor a dot.
pcrecpp::RE re("^/ongoing/When/\\d\\d\\dx/\\d\\d\\d\\d/\\d\\d/\\d\\d/[^ .]+$");
// Pattern for referrers that analyse_line() leaves out of the referrer tally:
// anything starting with http://www.tbray.org/ongoing/.
// NOTE(review): the dots in the host name are unescaped, so each one matches
// any character, not only a literal '.' -- confirm whether that is intended.
pcrecpp::RE re2("^http://www.tbray.org/ongoing/");

// Hash functor that lets std::string be used as a __gnu_cxx::hash_map key.
// It delegates to the library's C-string hash, so only the characters up to
// the first NUL byte of the string contribute to the result.
struct StringHash {
   std::size_t operator() (std::string const & key) const {
      __gnu_cxx::hash<char const *> c_string_hasher;
      char const * const text = key.c_str();
      return c_string_hasher(text);
   }
};

// Tally table: string key -> unsigned long long counter, hashed with StringHash.
typedef __gnu_cxx::hash_map<std::string, unsigned long long, StringHash> Hash;

// Global tallies, filled by analyse_line() and printed by report():
//   s404s   - request URI (token 6)   -> number of GET requests answered 404
//   u_bytes - request URI             -> summed byte count of 200 responses
//                                        (304 responses add 0)
//   u_hits  - request URI matching re -> number of 200/304 GET responses
//   clients - token 0 of the line     -> number of such hits
//                                        (presumably the client address)
//   refs    - referrer without quotes -> number of such hits, leaving out
//                                        "-", "" and referrers matching re2
Hash s404s, u_bytes, u_hits, clients, refs;

/**
 * Parse one access-log line and fold it into the global tallies.
 *
 * The line is split on single spaces and the tokens are used by position:
 * 0 = client, 5 = quoted method, 6 = request URI, 8 = status, 9 = byte count,
 * 10 = quoted referrer (presumably the Apache combined log format -- confirm
 * against the logs actually fed in).
 *
 * Only GET requests are considered. A 404 bumps s404s and nothing else. A 200
 * or 304 adds the byte count (0 for 304) to u_bytes; if the URI also matches
 * `re`, it bumps u_hits and clients, and records the referrer in refs unless
 * it is "-", empty, or matches `re2`. Any other status is ignored.
 *
 * Lines with too few tokens are skipped instead of being indexed out of
 * bounds; a missing byte count is treated as 0 and a missing or unquoted
 * (shorter than two characters) referrer is simply not recorded.
 *
 * @param line one log line without its trailing newline
 */
void analyse_line(std::string const & line)
{
   std::vector<std::string> parts;
   split(parts, line, is_any_of(" "));

   // Tokens 5, 6 and 8 are read unconditionally below; a blank or truncated
   // line has fewer tokens and must not be indexed.
   if (parts.size() < 9) return;

   if (parts[5] != "\"GET") return;

   std::string const & uri = parts[6];
   std::string const & status = parts[8];

   if (status == "404") {
      s404s[uri]++;
      return;
   }
   std::size_t size;
   if (status == "304") {
      size = 0;
   } else if (status == "200") {
      // strtol yields 0 for a non-numeric field such as "-"; a missing field
      // is treated the same way.
      size = parts.size() > 9 ? strtol(parts[9].c_str(), NULL, 10) : 0;
   } else {
      return;
   }
   u_bytes[uri] += size;
   if (re.FullMatch(uri)) {
      u_hits[uri]++;
      clients[parts[0]]++;
      // The referrer token needs at least the two surrounding quotes before
      // they can be stripped; substr(1, ...) on an empty token would throw.
      if (parts.size() > 10 and parts[10].size() >= 2) {
         std::string const & quoted = parts[10];
         if (quoted != "\"-\"" and quoted != "\"\"") {
            std::string const ref = quoted.substr(1, quoted.size() - 2);
            if (not re2.PartialMatch(ref)) {
               refs[ref]++;
            }
         }
      }
   }
}

// Ordering predicate for (key, count) pairs: true when the first pair has the
// strictly larger count, so sorting with it puts the biggest counts first.
// The keys themselves are not compared.
bool cmp(std::pair<std::string, unsigned long long> const & lhs,
         std::pair<std::string, unsigned long long> const & rhs)
{
   unsigned long long const left_count = lhs.second;
   unsigned long long const right_count = rhs.second;
   return right_count < left_count;
}

// Orders any two pair-like values by descending `second`. Being a template,
// it compares a hash_map element (pair<const string, count>) with a copied
// result entry (pair<string, count>) directly, without converting one into
// the other and copying its string for every comparison.
struct CountGreater {
   template <typename Lhs, typename Rhs>
   bool operator() (Lhs const & lhs, Rhs const & rhs) const {
      return lhs.second > rhs.second;
   }
};

/**
 * Print the (up to) ten entries of a tally table with the largest counts.
 *
 * Output is a "Top <str>:" heading, one line per entry in descending count
 * order, and a blank line. The order among entries with equal counts is
 * whatever the partial sort produces.
 *
 * @param set     tally table to summarise; it is not modified
 * @param str     label printed after "Top " in the heading
 * @param schrink when true, counts are divided by 1024*1024 and printed with
 *                one decimal and an 'M' suffix; otherwise printed as integers
 */
void report(Hash & set, char const * str, bool schrink = false)
{
   typedef std::pair<std::string, unsigned long long> Entry;

   std::cout << "Top " << str << ":\n";

   std::vector<Entry> vec(10);
   // partial_sort_copy returns the end of the filled part of `vec`, i.e.
   // vec.begin() + min(vec.size(), set.size()), so no separate bound on
   // set.size() is needed when the table holds fewer than ten entries.
   std::vector<Entry>::const_iterator const filled_end =
      std::partial_sort_copy(set.begin(), set.end(),
                             vec.begin(), vec.end(), CountGreater());

   // printf and std::cout are mixed here; that keeps its ordering because the
   // program leaves iostream/stdio synchronisation at its default (enabled).
   for (std::vector<Entry>::const_iterator it = vec.begin(); it != filled_end; ++it) {
      if (schrink) {
         std::printf(" %9.1fM: %s\n", it->second / (1024.0 * 1024.0),
                                      it->first.c_str());
      } else {
         std::printf(" %10llu: %s\n", it->second,
                                      it->first.c_str());
      }
   }
   std::cout << std::endl;
}

/**
 * Entry point: analyse the log file named by the single command-line argument
 * and print a summary followed by the five "Top ..." reports.
 *
 * Exits with status -1 (after a diagnostic on standard error) when the
 * argument count is wrong or the file cannot be opened for reading.
 */
int main(int argc, char ** argv)
{
   if (argc != 2) {
      // Diagnostics go to stderr so they never mix with the report on stdout.
      std::cerr << "Wrong number of arguments: " << argv[0]  << " filename\n";
      exit(-1);
   }

   // Input-only stream: std::fstream's default mode is in|out, which fails on
   // files the user may read but not write (typical for server logs).
   std::ifstream file(argv[1]);
   if (not file) {
      std::cerr << "Could not open file: " << argv[1] << std::endl;
      exit(-1);
   }

   std::string line;
   while (std::getline(file, line)) {
      analyse_line(line);
   }

   std::cout << u_hits.size() << " resources, " << s404s.size() << " 404s, "
             << clients.size() << " clients\n\n";

   report(u_hits,  "URIs by hit");
   report(u_bytes, "URIs by bytes", true);
   report(s404s,   "404s");
   report(clients, "client addresses");
   report(refs,    "referrers");
}
