Bigdata / Hadoop / Programming
0

Parsing sqoop logs for stats analysis

by robin · Published May 9, 2017 · Updated May 9, 2017

The Python code below extracts statistics from a set of Sqoop log files for transfer analysis:

#!/usr/bin/env python
import fnmatch
import os
import datetime


def find_files(directory, pattern):
    """Yield the path of every file under *directory* whose name matches *pattern*.

    The directory tree is walked recursively with ``os.walk``. *pattern* is a
    shell-style glob (``fnmatch`` syntax) and is tested against the file's
    base name only, never against the directory part of the path.
    """
    for parent, _subdirs, names in os.walk(directory):
        for matched in fnmatch.filter(names, pattern):
            yield os.path.join(parent, matched)


def change_dt_format(dt_val):
    """Re-format a ``yy/mm/dd HH:MM:SS`` timestamp string as ``mm/dd/yy HH:MM:SS``.

    Raises ``ValueError`` (from ``strptime``) when *dt_val* is not in the
    expected input format.
    """
    parsed = datetime.datetime.strptime(dt_val, '%y/%m/%d %H:%M:%S')
    return parsed.strftime('%m/%d/%y %H:%M:%S')


def datetime_diff_minutes(start_ts, end_ts):
    """Return the elapsed time between two timestamps in whole minutes, as a string.

    Both arguments are ``yy/mm/dd HH:MM:SS`` strings. The result is the number
    of complete minutes from *start_ts* to *end_ts*, rounded down, e.g. ``'5'``.

    Raises ``ValueError`` (from ``strptime``) when either timestamp is not in
    the expected format.
    """
    fmt = '%y/%m/%d %H:%M:%S'
    start = datetime.datetime.strptime(start_ts, fmt)
    end = datetime.datetime.strptime(end_ts, fmt)
    elapsed = end - start
    # timedelta.seconds holds only the sub-day remainder, so a transfer that
    # runs for 24 hours or more would wrap around; total_seconds() includes
    # the days component as well.
    return str(int(elapsed.total_seconds() // 60))


# Root directory searched for logs, and the glob selecting log files in it.
start_dir = '/Users/robin/Downloads/EDW'
file_pattern = '*.log'

# Text that marks the first and last line of one table-import block.
block_begin = 'Importing from Teradata Table:'
block_end = 'Teradata import job completed with exit code'

# Counter prefixes; the value is the text that follows the prefix on the line.
file_bytes_read = 'FILE: Number of bytes read='
file_bytes_written = 'FILE: Number of bytes written='
hdfs_bytes_read = 'HDFS: Number of bytes read='
hdfs_bytes_written = 'HDFS: Number of bytes written='
map_input_rec = 'Map input records='
map_output_rec = 'Map output records='
time_maps_in_oslots = 'Total time spent by all maps in occupied slots (ms)='

# Number of tab separators a fully populated record contains (columns - 1).
total_column_count_oneless = 9

# Raw start/end timestamps of the block currently being parsed.
timestamp_start = timestamp_end = ""

# Emit the tab-separated header row. Counter values are appended to a record
# in the order their lines occur in the log, so the counter columns below
# assume the log prints them in this order -- TODO confirm against real logs.
print('State\tStart Time\tTable name\tfile_bytes_read\tfile_bytes_written\thdfs_bytes_read\thdfs_bytes_written\tMap time spent\tMap In Records\tMap Out Records\tEnd Time\tTime Diff(mins)\tFilename')

# Every counter prefix whose value becomes one column of the record.
counter_markers = (
    file_bytes_read,
    file_bytes_written,
    hdfs_bytes_read,
    hdfs_bytes_written,
    map_input_rec,
    map_output_rec,
    time_maps_in_oslots,
)

the_record = ''
for filename in find_files(start_dir, file_pattern):
    # A block cannot span files: drop whatever a previous log left behind if
    # it ended in the middle of a block.
    the_record = timestamp_end = timestamp_start = ''
    # The context manager closes each log even if parsing raises.
    with open(filename, 'r') as f:
        for raw_line in f:
            # Strip only the line terminator, so a final line without a
            # newline keeps its last character and CRLF logs lose the '\r'.
            line = raw_line.rstrip('\r\n')

            pos = line.find(block_begin)
            if pos != -1:
                # Start a fresh record; a previous block that never reached
                # its end marker is discarded instead of corrupting this one.
                timestamp_start = line[:17]
                the_record = change_dt_format(timestamp_start)
                the_record += "\t" + line[pos + len(block_begin):]

            for marker in counter_markers:
                pos = line.find(marker)
                if pos != -1:
                    the_record += "\t" + line[pos + len(marker):]

            pos = line.find(block_end)
            if pos != -1:
                timestamp_end = line[:17]
                the_record += "\t" + change_dt_format(timestamp_end)
                # Only complete records are reported: a start marker was seen
                # and exactly the expected number of columns was collected.
                if timestamp_start and the_record.count("\t") == total_column_count_oneless:
                    print("Good\t" + the_record + "\t" + datetime_diff_minutes(timestamp_start, timestamp_end) + "\t" + filename)
                the_record = timestamp_end = timestamp_start = ''

Tags: bigdata hadoop programming python

You may also like...

Leave a Reply Cancel reply