Merge "Consolidated files, replaced previous logging with receiving DNS queries from socket. Periodically fetches popular hosts."
diff --git a/cache_warming/cache_warming.py b/cache_warming/cache_warming.py
new file mode 100644
index 0000000..7b76fc3
--- /dev/null
+++ b/cache_warming/cache_warming.py
@@ -0,0 +1,169 @@
+#!/usr/bin/python
+"""Updates hit log and periodically fetches top requested hosts.
+
+Loads previously saved top requested hosts. Reads DNS queries
+through socket and updates hit log dictionary with most recent
+hit time and hit count for each host. Periodically sorts hosts
+in hit log by number of hits, fetches a predetermined number of
+the top requested hosts, and saves fetched hosts.
+"""
+
+import argparse
+from datetime import datetime
+import json
+import os
+import socket
+import dns.resolver
+
+hit_log = {}
+last_fetch = datetime.min
+TOP_N = 50
+FETCH_INTERVAL = 60  # seconds
+UDP_SERVER_PATH = '/tmp/dns_query_log_socket'
+HOSTS_JSON_PATH = '/config/cache_warming_hosts.json'
+
+
+def save_hosts(log):
+  """Saves predetermined number of top requested hosts in json file.
+
+  Stores dictionary with host key and tuple value containing hit count
+  and most recent hit time.
+
+  Args:
+    log: Dictionary of top requested hosts with host key and tuple value
+         containing hit count and most recent hit time.
+  """
+  d = os.path.dirname(HOSTS_JSON_PATH)
+  if not os.path.exists(d):
+    os.makedirs(d)
+  with open(HOSTS_JSON_PATH, 'w') as hosts_json:
+    json.dump(log, hosts_json)
+
+
+def load_hosts():
+  """Loads hosts stored in json file.
+
+  Loads dictionary with host key and tuple value containing hit count
+  and most recent hit time as hit_log if it exists.
+  """
+  if os.path.isfile(HOSTS_JSON_PATH):
+    with open(HOSTS_JSON_PATH, 'r') as hosts_json:
+      global hit_log
+      hit_log = json.load(hosts_json)
+
+
+def process_query(qry):
+  """Processes DNS query and updates hit log.
+
+  Parses DNS query and updates requested host's hit count and
+  most recent hit time in hit log.
+
+  Args:
+    qry: String representing a DNS query of the format
+            '[Unix time] [host name]'.
+  """
+  time, _, host = qry.partition(' ')
+  if host in hit_log:
+    hit_log[host] = (hit_log[host][0] + 1, time)
+  else:
+    hit_log[host] = (1, time)
+
+
+def hit_log_subset(hosts):
+  """Makes a subset of hit log containing selected hosts.
+
+  Args:
+    hosts: List of hosts to be included in the subset of hit log.
+
+  Returns:
+    Dictionary of selected hosts with host key and tuple value
+    containing hit count and most recent hit time.
+  """
+  return {k: hit_log[k] for k in hosts}
+
+
+def fetch(hosts, port, server):
+  """Fetches hosts and saves subset of hit log containing fetched hosts.
+
+  Only fetches predetermined number of top requested hosts. If fetch
+  fails, host is removed from hit log and saved hosts list.
+
+  Args:
+    hosts: List of hosts to be fetched sorted by number of hits
+           in descending order.
+    port: Port to which to send queries (default is 53).
+    server: Alternate nameservers to query (default is None).
+  """
+  my_resolver = dns.resolver.Resolver()
+  my_resolver.port = port
+  if server is not None:
+    my_resolver.nameservers = server
+
+  # Iterate over a fixed slice and collect successes separately:
+  # removing from the list being iterated skips the next element.
+  fetched = []
+  for host in hosts[:TOP_N]:
+    try:
+      my_resolver.query(host)
+      fetched.append(host)
+    except Exception:  # narrowed from bare except (kept KeyboardInterrupt alive)
+      del hit_log[host]
+  save_hosts(hit_log_subset(fetched))
+
+
+def sort_hit_log():
+  """Sorts hosts in hit log by number of hits.
+
+  Returns:
+    A list of hosts sorted by number of hits in descending order.
+  """
+  return sorted(hit_log, key=hit_log.get, reverse=True)
+
+
+def warm_cache():
+  """Warms cache with predetermined number of most requested hosts.
+
+  Sorts hosts in hit log by hit count, fetches predetermined
+  number of top requested hosts, updates last fetch time.
+  """
+  sorted_hosts = sort_hit_log()
+  fetch(sorted_hosts, args.port, args.server)
+  global last_fetch
+  last_fetch = datetime.now()
+
+
+def set_args():
+  """Sets arguments for script.
+
+  Returns:
+    List of arguments containing port and server.
+  """
+  parser = argparse.ArgumentParser()
+  parser.add_argument('-p', '--port', nargs='?', default=53, type=int,
+                      help='port to which to send queries (default is 53).')
+  parser.add_argument('-s', '--server', nargs='*', type=str,
+                      help='alternate nameservers to query (default is None).')
+  return parser.parse_args()
+
+
+if __name__ == '__main__':
+  args = set_args()
+  load_hosts()
+
+  server_address = UDP_SERVER_PATH
+  try:
+    os.remove(server_address)
+  except OSError:
+    if os.path.exists(server_address):
+      raise
+  sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
+  sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+  sock.bind(server_address)
+  os.chmod(server_address, 0o777)
+
+  while True:
+    # Warm the cache whenever the configured interval has elapsed.
+    if (datetime.now() - last_fetch).total_seconds() > FETCH_INTERVAL:
+      warm_cache()
+    data = sock.recv(128)
+    process_query(data)
diff --git a/cache_warming/cache_warming_test.py b/cache_warming/cache_warming_test.py
new file mode 100644
index 0000000..83ff1e8
--- /dev/null
+++ b/cache_warming/cache_warming_test.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+"""Tests for cache_warming.py."""
+
+import cache_warming
+from wvtest import wvtest
+
+
+@wvtest.wvtest
+def testProcessQuery_firstHit():
+  qry = '123456789 www.yahoo.com'
+  expected = {'www.yahoo.com': (1, '123456789')}
+  cache_warming.hit_log = {}
+  cache_warming.process_query(qry)
+  actual = cache_warming.hit_log
+  wvtest.WVPASSEQ(actual, expected)
+
+
+@wvtest.wvtest
+def testProcessQuery_updateHitCount():
+  qry = '123456789 www.yahoo.com'
+  cache_warming.hit_log = {'www.yahoo.com': (1, '123456789')}
+  cache_warming.process_query(qry)
+  expected = 2
+  actual = cache_warming.hit_log['www.yahoo.com'][0]
+  wvtest.WVPASSEQ(actual, expected)
+
+
+@wvtest.wvtest
+def testProcessQuery_updateRecentHitTime():
+  qry = '123456789 www.yahoo.com'
+  cache_warming.hit_log = {'www.yahoo.com': (1, '987654321')}
+  cache_warming.process_query(qry)
+  expected = '123456789'
+  actual = cache_warming.hit_log['www.yahoo.com'][1]
+  wvtest.WVPASSEQ(actual, expected)
+
+
+@wvtest.wvtest
+def testSortHitLog_empty():
+  cache_warming.hit_log = {}
+  expected = []
+  actual = cache_warming.sort_hit_log()
+  wvtest.WVPASSEQ(actual, expected)
+
+
+@wvtest.wvtest
+def testSortHitLog_nonEmpty():
+  cache_warming.hit_log = {
+      'www.google.com': (2, '123456789'),
+      'www.yahoo.com': (1, '987654321'),
+      'www.espn.com': (3, '135792468')
+  }
+  expected = ['www.espn.com', 'www.google.com', 'www.yahoo.com']
+  actual = cache_warming.sort_hit_log()
+  wvtest.WVPASSEQ(actual, expected)
+
+@wvtest.wvtest
+def testHitLogSubset():
+  hosts = ['www.google.com', 'www.yahoo.com']
+  cache_warming.hit_log = {
+      'www.youtube.com': (4, '987654321'),
+      'www.google.com': (1, '987654321'),
+      'www.espn.com': (3, '123456789'),
+      'www.yahoo.com': (2, '135792468')
+  }
+  expected = {
+      'www.yahoo.com': (2, '135792468'),
+      'www.google.com': (1, '987654321')
+  }
+  actual = cache_warming.hit_log_subset(hosts)
+  wvtest.WVPASSEQ(actual, expected)
+
+
+if __name__ == '__main__':
+  wvtest.wvtest_main()
diff --git a/cache_warming/fetch_popular.py b/cache_warming/fetch_popular.py
deleted file mode 100644
index fe0f2b4..0000000
--- a/cache_warming/fetch_popular.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/python
-"""Pre-fetches top requested hosts.
-
-Sorts dictionary represented in hit_log.json by number of hits
-and sends DNS requests to a predetermined number of the top hosts.
-"""
-
-import argparse
-import json
-import dns.resolver
-
-TOP_N = 50
-HITS_LOG_JSON_PATH = '/tmp/hits_log.json'
-
-
-def sort_hits_log(path):
-  """Sorts hosts in hits log by number of hits.
-
-  Args:
-    path: Path of JSON representation of dictionary mapping host
-          to tuple of most recent hit time and hit count.
-
-  Returns:
-    A list of hosts sorted by number of hits in descending order.
-  """
-  try:
-    log_json = open(path, 'r')
-  except IOError:
-    print 'unable to open ' + path
-    raise
-  else:
-    log = json.load(log_json)
-    return sorted(log, key=log.get, reverse=True)
-
-
-def prefetch(hosts, port, server):
-  """Pre-fetches list of hosts.
-
-  Args:
-    hosts: List of hosts to be fetched sorted by number of hits
-           in descending order.
-    port: Port to which to send queries (default is 53).
-    server: Alternate nameservers to query (default is None).
-  """
-  my_resolver = dns.resolver.Resolver()
-  my_resolver.port = port
-  if server is not None:
-    my_resolver.nameservers = server
-
-  if len(hosts) > TOP_N:
-    hosts = hosts[:TOP_N]
-  for host in hosts:
-    my_resolver.query(host)
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument('-p', '--port', nargs='?', default=53, type=int,
-                      help='port to which to send queries (default is 53).')
-  parser.add_argument('-s', '--server', nargs='*', type=str,
-                      help='alternate nameservers to query (default is None).')
-  args = parser.parse_args()
-
-  sorted_log = sort_hits_log(HITS_LOG_JSON_PATH)
-  prefetch(sorted_log, args.port, args.server)
diff --git a/cache_warming/fetch_popular_test.py b/cache_warming/fetch_popular_test.py
deleted file mode 100644
index ea9da8d..0000000
--- a/cache_warming/fetch_popular_test.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/python
-"""Tests for fetch_popular.py."""
-
-import os
-import fetch_popular
-from wvtest import wvtest
-
-
-@wvtest.wvtest
-def testSortHitsLog_empty():
-  try:
-    file_name = 'test_log.json'
-    with open(file_name, 'w') as f:
-      f.write('{}')
-
-    expected = []
-    actual = fetch_popular.sort_hits_log(file_name)
-    wvtest.WVPASSEQ(actual, expected)
-  finally:
-    os.remove(file_name)
-
-
-@wvtest.wvtest
-def testSortHitsLog_nonEmpty():
-  try:
-    file_name = 'test_log.json'
-    with open(file_name, 'w') as f:
-      f.write('{"www.google.com": [2, "123456789"], "www.yahoo.com":'
-      	       ' [1,"987654321"], "www.espn.com": [3, "135792468"]}')
-
-    expected = ['www.espn.com', 'www.google.com', 'www.yahoo.com']
-    actual = fetch_popular.sort_hits_log(file_name)
-    wvtest.WVPASSEQ(actual, expected)
-  finally:
-    os.remove(file_name)
-
-
-if __name__ == '__main__':
-  wvtest.wvtest_main()
diff --git a/cache_warming/log_hits.py b/cache_warming/log_hits.py
deleted file mode 100644
index 5fbc2e8..0000000
--- a/cache_warming/log_hits.py
+++ /dev/null
@@ -1,124 +0,0 @@
-#!/usr/bin/python
-"""Updates most recent hit time and hit count for hosts in hits log.
-
-Reads queries from dns_query_log.txt and updates hosts in hits log
-dictionary with most recent hit time and hit count for each host.
-Saves hits log dictionary as hits_log.json for future modification.
-"""
-
-import json
-import os.path
-
-DNS_QUERY_LOG_PATH = '/tmp/dns_query_log.txt'
-HITS_LOG_JSON_PATH = '/tmp/hits_log.json'
-
-
-def process_line(log, ln):
-  """Processes a line of DNS query log and updates hits log.
-
-  Parses line and updates most recent hit time and hit count
-  for host in hits log.
-
-  Args:
-    log: Dictionary mapping host to tuple of hit count and most
-         recent hit time.
-    ln: String representing a line of DNS query log of the
-          format '[Unix time] [host name]'.
-
-  Returns:
-    An updated dictionary mapping host to tuple of hit count and
-    most recent hit time.
-  """
-  time, _, host = ln[:-1].partition(' ')
-  if host in log:
-    log[host] = (log[host][0] + 1, time)
-  else:
-    log[host] = (1, time)
-  return log
-
-
-def read_dns_query_log(path):
-  """Reads a DNS query log.
-
-  Processes each line of file, updating a hits log.
-
-  Args:
-    path: Path of DNS query log to be read.
-
-  Returns:
-    An updated dictionary mapping host to tuple of hit count and
-    most recent hit time.
-  """
-  try:
-    dns_query_log = open(path, 'r')
-  except IOError:
-    print 'unable to open ' + path
-  else:
-    log = {}
-    for line in dns_query_log:
-      log = process_line(log, line)
-    dns_query_log.close()
-    return log
-
-
-def clear_dns_query_log(path):
-  """Clears a DNS query log.
-
-  Opens file for write without writing anything.
-
-  Args:
-    path: Path of DNS query log to be cleared.
-  """
-  try:
-    open(path, 'w').close()
-    return
-  except IOError:
-    print 'unable to open ' + path
-
-
-def merge_logs(log, hist):
-  """Merges two hit logs.
-
-  Merges smaller hit log to larger hit log. Uses most recent hit
-  time and sums hit count from each log for each host.
-
-  Args:
-    log: Dictionary mapping host to tuple of hit count and
-         most recent hit time.
-    hist: Similar dictionary representing previous query history.
-
-  Returns:
-    An updated dictionary mapping host to tuple of hit count and
-    most recent hit time.
-  """
-  hist_larger = len(hist) > len(log)
-  big_log, small_log = (hist, log) if hist_larger else (log, hist)
-  for k, v in small_log.iteritems():
-    if k in big_log:
-      time = log[k][1]
-      big_log[k] = (big_log[k][0] + v[0], time)
-    else:
-      big_log[k] = (v[0], v[1])
-  return big_log
-
-
-if __name__ == '__main__':
-  hit_log = read_dns_query_log(DNS_QUERY_LOG_PATH)
-  clear_dns_query_log(DNS_QUERY_LOG_PATH)
-  if os.path.isfile(HITS_LOG_JSON_PATH):
-    hist_json = open(HITS_LOG_JSON_PATH, 'r')
-    hit_log_hist = json.load(hist_json)
-    hist_json.close()
-
-    hist_json = open(HITS_LOG_JSON_PATH, 'w')
-    json.dump(merge_logs(hit_log, hit_log_hist), hist_json)
-    hist_json.close()
-  else:
-    try:
-      hist_json = open(HITS_LOG_JSON_PATH, 'w')
-    except IOError:
-      print 'unable to open ' + HITS_LOG_JSON_PATH
-      raise
-    else:
-      json.dump(hit_log, hist_json)
-      hist_json.close()
diff --git a/cache_warming/log_hits_test.py b/cache_warming/log_hits_test.py
deleted file mode 100644
index 26944ce..0000000
--- a/cache_warming/log_hits_test.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#!/usr/bin/python
-"""Tests for log_hits.py."""
-
-import os
-import log_hits
-from wvtest import wvtest
-
-
-@wvtest.wvtest
-def testProcessLine_firstHit():
-  line = '123456789 www.yahoo.com\n'
-  expected = {'www.yahoo.com': (1, '123456789')}
-  actual = log_hits.process_line({}, line)
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testProcessLine_updateHitCount():
-  line = '123456789 www.yahoo.com\n'
-  log = {'www.yahoo.com': (1, '123456789')}
-  expected = 2
-  actual = log_hits.process_line(log, line)['www.yahoo.com'][0]
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testProcessLine_updateRecentHitTime():
-  line = '123456789 www.yahoo.com\n'
-  log = {'www.yahoo.com': (1, '987654321')}
-  expected = '123456789'
-  actual = log_hits.process_line(log, line)['www.yahoo.com'][1]
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_emptyLog():
-  hist = {'www.yahoo.com': (1, '123456789')}
-  expected = hist
-  actual = log_hits.merge_logs({}, hist)
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_emptyHist():
-  log = {'www.yahoo.com': (1, '123456789')}
-  expected = log
-  actual = log_hits.merge_logs(log, {})
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_bothEmpty():
-  expected = {}
-  actual = log_hits.merge_logs({}, {})
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_noOverlap():
-  log = {'www.yahoo.com': (1, '123456789')}
-  hist = {'www.google.com': (1, '123456789')}
-  expected = {
-      'www.yahoo.com': (1, '123456789'),
-      'www.google.com': (1, '123456789')
-  }
-  actual = log_hits.merge_logs(log, hist)
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_updateHitCount():
-  log = {'www.yahoo.com': (1, '987654321')}
-  hist = {'www.yahoo.com': (1, '123456789')}
-  expected = 2
-  actual = log_hits.merge_logs(log, hist)['www.yahoo.com'][0]
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_updateRecentHitTime():
-  log = {'www.yahoo.com': (1, '987654321')}
-  hist = {'www.yahoo.com': (1, '123456789')}
-  expected = '987654321'
-  actual = log_hits.merge_logs(log, hist)['www.yahoo.com'][1]
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_histLargerNoOverlap():
-  log = {'www.yahoo.com': (1, '123456789')}
-  hist = {
-      'www.google.com': (1, '123456789'),
-      'www.espn.com': (1, '123456789')
-  }
-  expected = {
-      'www.yahoo.com': (1, '123456789'),
-      'www.google.com': (1, '123456789'),
-      'www.espn.com': (1, '123456789')
-  }
-  actual = log_hits.merge_logs(log, hist)
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_histLargerUpdateHitCount():
-  log = {'www.yahoo.com': (1, '987654321')}
-  hist = {
-      'www.yahoo.com': (1, '123456789'),
-      'www.google.com': (1, '123456789')
-  }
-  expected = 2
-  actual = log_hits.merge_logs(log, hist)['www.yahoo.com'][0]
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testMergeLogs_histLargerUpdateRecentHitTime():
-  log = {'www.yahoo.com': (1, '987654321')}
-  hist = {
-      'www.yahoo.com': (1, '123456789'),
-      'www.google.com': (1, '123456789')
-  }
-  expected = '987654321'
-  actual = log_hits.merge_logs(log, hist)['www.yahoo.com'][1]
-  wvtest.WVPASSEQ(actual, expected)
-
-
-@wvtest.wvtest
-def testReadDNSQueryLog_empty():
-  file_name = 'test_log.txt'
-  open(file_name, 'w').close()
-  expected = {}
-  actual = log_hits.read_dns_query_log(file_name)
-  wvtest.WVPASSEQ(actual, expected)
-  os.remove(file_name)
-
-
-@wvtest.wvtest
-def testReadDNSQueryLog_nonEmpty():
-  file_name = 'test_log.txt'
-  f = open(file_name, 'w')
-  f.write('123456789 www.yahoo.com\n987654321 www.google.com\n'
-          '135792468 www.yahoo.com\n')
-  f.close()
-  expected = {
-      'www.yahoo.com': (2, '135792468'),
-      'www.google.com': (1, '987654321')
-  }
-  actual = log_hits.read_dns_query_log(file_name)
-  wvtest.WVPASSEQ(actual, expected)
-  os.remove(file_name)
-
-
-@wvtest.wvtest
-def testClearDNSQueryLog():
-  file_name = 'test_log.txt'
-  f = open(file_name, 'w')
-  f.write('testing clear_dns_query_log()\n')
-  f.close()
-
-  log_hits.clear_dns_query_log(file_name)
-  expected = 0
-  actual = os.stat(file_name).st_size
-  wvtest.WVPASSEQ(actual, expected)
-  os.remove(file_name)
-
-
-if __name__ == '__main__':
-  wvtest.wvtest_main()
diff --git a/cache_warming/warm_cache b/cache_warming/warm_cache
deleted file mode 100644
index fa334d0..0000000
--- a/cache_warming/warm_cache
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-# Periodically processes logged DNS queries and prefetches
-# prefetches the most requested hosts, warming the cache.
-
-while sleep 60; do
-  python log_hits.py
-  python fetch_popular.py
-done