Extracting table and column names from SQL query
(Not original article)
sql-metadata is a Python library that uses a tokenized query returned by python-sqlparse and generates query metadata.
This metadata can return column and table names from your supplied SQL query. Here are a couple of example from the sql-metadata github readme:
>>> sql_metadata.get_query_columns("SELECT test, id FROM foo, bar") [u'test', u'id'] >>> sql_metadata.get_query_tables("SELECT test, id FROM foo, bar") [u'foo', u'bar'] >>> sql_metadata.get_query_limit_and_offset('SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 1000') (50, 1000)
Code adapted from https://grisha.org/blog/2016/11/14/table-names-from-sql/
import re def tables_in_query(sql_str): # remove the /* */ comments q = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", sql_str) # remove whole line -- and # comments lines = [line for line in q.splitlines() if not re.match("^\s*(--|#)", line)] # remove trailing -- and # comments q = " ".join([re.split("--|#", line)[0] for line in lines]) # split on blanks, parens and semicolons tokens = re.split(r"[\s)(;]+", q) # scan the tokens. if we see a FROM or JOIN, we set the get_next # flag, and grab the next one (unless it's SELECT). table = set() get_next = False for tok in tokens: if get_next: if tok.lower() not in ["", "select"]: table.add(tok) get_next = False get_next = tok.lower() in ["from", "join"] dictTables = dict() for table in tables: fields = [] for token in tokens: if token.startswith(table): if token != table: fields.append(token) if len(list(set(fields))) >= 1: dictTables[table] = list(set(fields)) return dictTables
import ply.lex as lex, re tokens = ( "TABLE", "JOIN", "COLUMN", "TRASH" ) tables = {"tables": {}, "alias": {}} columns = [] t_TRASH = r"Select|on|=|;|\s+|,|\t|\r" def t_TABLE(t): r"from\s(\w+)\sas\s(\w+)" regex = re.compile(t_TABLE.__doc__) m = regex.search(t.value) if m is not None: tbl = m.group(1) alias = m.group(2) tables["tables"][tbl] = "" tables["alias"][alias] = tbl return t def t_JOIN(t): r"inner\s+join\s+(\w+)\s+as\s+(\w+)" regex = re.compile(t_JOIN.__doc__) m = regex.search(t.value) if m is not None: tbl = m.group(1) alias = m.group(2) tables["tables"][tbl] = "" tables["alias"][alias] = tbl return t def t_COLUMN(t): r"(\w+\.\w+)" regex = re.compile(t_COLUMN.__doc__) m = regex.search(t.value) if m is not None: t.value = m.group(1) columns.append(t.value) return t def t_error(t): raise TypeError("Unknown text '%s'" % (t.value,)) t.lexer.skip(len(t.value)) # here is where the magic starts def mylex(inp): lexer = lex.lex() lexer.input(inp) for token in lexer: pass result = {} for col in columns: tbl, c = col.split('.') if tbl in tables["alias"].keys(): key = tables["alias"][tbl] else: key = tbl if key in result: result[key].append(c) else: result[key] = list() result[key].append(c) print result # {'tb1': ['col1', 'col7'], 'tb2': ['col2', 'col8']} string = "Select a.col1, b.col2 from tb1 as a inner join tb2 as b on tb1.col7 = tb2.col8;" mylex(string)
References
https://stackoverflow.com/questions/35624662/how-to-extract-table-names-and-column-names-from-sql-query
https://stackoverflow.com/questions/22976019/how-to-get-table-name-of-the-executed-query-python-sqlite