logstash

This is a small set of notes on Logstash, which I’ve been using to import log data into an Elasticsearch cluster for analysis.

Processing (and geo-referencing) custom HTTP logs

This is the full configuration, including the required grok pattern, for my custom HTTP logs, which use a custom date format (a space between date and time instead of the usual colon) and append fields for “time to first byte” and overall processing time:

input {
  stdin {}
  # cat the logs to nc localhost 3333
  tcp { 
    port => 3333
  } 
}

filter {
  grok {
    match => [
        # kudos to http://grokdebug.herokuapp.com/ for helping me test this
        "message",
        '%{IP:client} %{NOTSPACE} %{NOTSPACE} \[(?<timestamp>%{MONTHDAY}/%{MONTH}/%{YEAR} %{TIME})\] "%{WORD:verb} %{NOTSPACE:url} HTTP/%{NUMBER:http_version}" %{NUMBER:code} (?:%{NUMBER:bytes}|-) (?:"(?:%{URI:referrer}|-)"|%{QS:referrer}) %{QS:agent} %{NUMBER:time_firstbyte} %{NUMBER:time_processing}'
    ]
  }
  date {
    # parse the captured timestamp field into the event's @timestamp
    match => [
      "timestamp",
      "d/MMM/YYYY HH:mm:ss"
    ]
  }
  geoip {
    source => "client"
    target => "geoip"
    # build a [longitude, latitude] array -- the order Elasticsearch
    # expects for a geo_point supplied as an array
    add_field => [ "[geoip][coordinates]", "%{[geoip][longitude]}" ]
    add_field => [ "[geoip][coordinates]", "%{[geoip][latitude]}"  ]
  }
  mutate {
    # grok's NUMBER captures are strings, so convert them to floats
    convert => [ "[geoip][coordinates]", "float" ]
    convert => [ "[time_firstbyte]", "float" ]
    convert => [ "[time_processing]", "float" ]
  }
}

output {
  elasticsearch {
    # our cluster is behind NAT, with a single exposed machine, so the embedded
    # elasticsearch can't join it and we need to use transport mode.
    host => "cluster.local"
    protocol => "transport"
  }
  stdout {
    # colorized debug output -- disable for large imports, since it increases CPU load
    codec => rubydebug
  }
}
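
For reference, here is a hypothetical log line in this format (IP, URL, and timings invented) that the grok pattern above matches:

203.0.113.42 - - [12/Mar/2014 14:23:01] "GET /index.html HTTP/1.1" 200 5120 "http://example.com/" "Mozilla/5.0" 0.012 0.034

To run an import, start logstash with the configuration (assuming it is saved as http-logs.conf) and pipe the logs into the TCP input:

bin/logstash -f http-logs.conf
cat access.log | nc localhost 3333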