Refactoring and documentation of the csv importer

State of the art: CREATE TABLE with check of the given columns for
mallicious content
This commit is contained in:
Lea Laux 2021-02-16 10:06:04 +01:00 committed by KDV Admin
parent 7e37ac06b7
commit ec53065798

View File

@ -14,28 +14,23 @@ class CSVImporter:
assumed data types and inserting all the data.
"""
def __init__(self, database_connection, csv_file, delimiter=",", null_type="NULL", create_table=True,
table_name=None):
# Use the given database connection for further execution of database queries on the given database.
self.database_connection = database_connection
def __init__(self, database_connection, csv_file, delimiter=",", null_type="NULL", table_name=None):
# Use the csv file for loading the relevant data.
self.csv_file = csv_file
# Use a delimiter for the csv file, not necessarily ",".
self.delimiter = delimiter
# Define the null type of the csv file.
self.null_type = null_type
# Use a new table for saving the data and not only appending to an existing one.
self.create_table = create_table
# Get the name of the table.
self.table_name = table_name
# Save the csv data in a list.
self.csv_data = []
# Save the data types in a list.
self.data_types = []
# TODO: Create functions for connecting the result signal and the error signal
# Use the database query executor for executing the create table and insert queries.
self.database_query_executor = DatabaseQueryExecutor()
# Use the given database connection for further execution of database queries on the given database.
self.database_query_executor.database_connection = database_connection
self.database_query_executor.error.connect(self.print_error)
self.database_query_executor.result_data.connect(self.print_result)
@ -113,81 +108,120 @@ class CSVImporter:
self.data_types[check_column] = data_type
def get_data_type(self, value):
"""
Get the data type of a specific value in a readable format as Postgres data type. Every value can be a text in
the end.
"""
# If the value is the predefined null value/type, return a NULL value. Every other data type is still possible.
if value == self.null_type:
return "NULL"
# Try to cast the value to a float or an integer to check for a number.
try:
# Try to float the value and check for an integer.
if float(value).is_integer():
return "INT"
float(value)
# Only a float/REAL is a possible option now.
return "REAL"
# Ignore the value error, because it is not relevant in this case and allowed to go silently.
except ValueError:
pass
# Return TEXT, if a match could not be made.
return "TEXT"
def create_table_for_csv_data(self):
if self.create_table is not True:
return
"""
Create the table to store the csv data in the database.
"""
# Get the create statement of the table.
create_statement = self.get_create_statement()
with self.database_connection.cursor() as database_cursor:
database_cursor.execute(sql.SQL(create_statement))
# Assign the create statement as query to the table.
self.database_query_executor.database_query = create_statement
# Execute!
self.database_query_executor.submit_and_execute_query()
if database_cursor.description:
print(database_cursor.description)
def get_create_statement(self, check_ddl=True):
"""
Build the CREATE statement for the table for inserting the csv data. The option "check_ddl" secures the column
name against possible dangerous names. The default is True, because such checks after "blind" imports are a
security feature.
"""
def get_create_statement(self):
# Get the table name, so the table name can be used in the create statement.
self.get_table_name()
# Add the table name to the query.
create_table_query = "CREATE TABLE {} (".format(self.table_name)
# Get the header as start of the csv data, because the columns are defined here.
header = self.csv_data[0]
# Iterate over the header. The column count is necessary to set the comma value at the correct point, because
# the column to create does not need a comma at the end.
for column_count in range(len(header)):
# Define the comma to set after the definition of the column.
if column_count != len(header)-1:
comma_value = ","
else:
comma_value = ""
create_column = "{} {}{}\n".format(header[column_count], self.data_types[column_count], comma_value)
# Get the current column.
current_column = header[column_count]
# If the name of the column should be checked, check it.
if check_ddl:
current_column = self.check_ddl_parameter(current_column)
# Define the current column with its name, its data type and its comma value.
create_column = "{} {}{}\n".format(current_column, self.data_types[column_count], comma_value)
# Build the table query by adding the new column.
create_table_query = "{}{}".format(create_table_query, create_column)
# End the query, so the query is in an executable format.
create_table_query = "{});".format(create_table_query)
print(create_table_query)
# TODO: Mechanism for the user to check the create table statement
return create_table_query
def get_table_name(self):
"""
Get the name of the table based on the name of the csv file, if the name is not specified by the user.
"""
if self.table_name is None:
# Split the csv file in the different parts of the path.
slash_split_list = self.csv_file.split("/")
# Get the last part of the list as file name, because the last part of the path is the file identifier.
self.table_name = slash_split_list[len(slash_split_list) - 1]
# Get the .csv out of the name by splitting.
csv_split_list = self.table_name.split(".csv")
# Use the part without csv as table name.
self.table_name = csv_split_list[0]
# Check the name of the table.
self.table_name = self.check_ddl_parameter(self.table_name)
@staticmethod
def check_ddl_parameter(parameter):
parameter = re.sub(r"[^a-zA-Z0-9 _\.]", "", parameter)
"""
Check the given data definition language parameter for potentially malicious characters. Those malicious
characters could cause an SQL injection, so nearly all special characters are kicked out.
"""
# Define a matching regular expression for allowed characters. Kick every other character. Allowed special
# characters are _, whitespace, . and some special (german) characters, because they can not do any harm.
parameter = re.sub(r"[^a-zA-ZäöüÄÖÜß0-9 _\.]", "", parameter)
# If a whitespace is part of the parameter, use " " around the parameter to make a valid column ddl parameter.
if " " in parameter:
parameter = '"{}"'.format(parameter)
return parameter
def print_error(self, error):
print(error)
def print_result(self, result):
print(result)
def do_all_the_stuff(self):
"""
Normal persons would call this function "main". This function is only a placeholder to remember me, that I'm
@ -209,3 +243,4 @@ if __name__ == "__main__":
csv_importer.parse_csv_file()
csv_importer.assume_data_types()
csv_importer.get_create_statement()