The S3 modular input doesn't check whether IsTruncated is set in the bucket listing and then use a marker to continue the listing. This has the effect of limiting the listing to the first 1000 objects. I made a quick modification to work around this, but the plugin has other major issues that make it unsuitable for production use, so I decided to use s3cmd in sync mode to just grab the logs and let Splunk index the files.
Here's the mod in case anyone is interested:
def get_objs_from_bucket(key_id, secret_key, bucket, subdir = None):
    """Return the complete list of object keys in an S3 bucket.

    Follows S3's paginated bucket-listing protocol: each response is capped
    at 1000 keys, and when the response's IsTruncated element is "true" the
    listing is re-requested with ?marker=<last key returned> until complete.

    key_id     -- AWS access key id
    secret_key -- AWS secret access key
    bucket     -- bucket name
    subdir     -- optional key prefix; listed non-recursively (delimiter=/)

    Raises Exception when S3 returns a non-200 status.
    """
    more_data = True
    # Start with an empty marker so the first request lists from the very
    # beginning of the bucket.  (Interpolating None produced "?marker=None",
    # a real lexical marker that silently skipped every key sorting <= "None".)
    marker = ""
    objs = []
    while more_data:
        # The marker is an S3 key and may contain query-unsafe characters,
        # so it must be URL-encoded just like the subdir prefix below.
        query_string = "?marker=%s" % (urllib.quote(marker))
        if subdir:
            query_string = "?marker=%s&prefix=%s&delimiter=/" % \
                (urllib.quote(marker), urllib.quote(subdir))
        conn = get_http_connection(key_id, secret_key, bucket, obj = None, query_string = query_string)
        resp = conn.getresponse()
        log_response(resp)
        if resp.status != 200:
            # Exception(...) call form works on both Python 2 and 3,
            # unlike the old "raise Exception, msg" statement syntax.
            raise Exception("AWS HTTP request return status code %d (%s): %s" % \
                (resp.status, resp.reason, get_amazon_error(resp.read())))
        bucket_listing = resp.read()
        conn.close()
        # parse AWS's bucket listing response
        doc = xml.dom.minidom.parseString(bucket_listing)
        root = doc.documentElement
        key_nodes = root.getElementsByTagName("Key")
        for key in key_nodes:
            if key.firstChild.nodeType == key.firstChild.TEXT_NODE:
                objs.append(key.firstChild.data)
        # NOTE(review): assumes S3 always includes exactly one IsTruncated
        # element in the listing response (true per the API docs).
        if root.getElementsByTagName("IsTruncated")[0].firstChild.data == "true":
            # Continue the next request from the last key of this page.
            marker = objs[-1]
            logging.info("found %d objects so far..." % (len(objs)))
        else:
            more_data = False
            logging.info("found %d objects total..." % (len(objs)))
    return objs
... View more