Improve data type hierarchy, add documentation

2021-02-17 12:25:03 +01:00 · 2021-02-17 12:25:03 +01:00 · 9f9fd245c0
commit 9f9fd245c0
parent f506bcbafe
2 changed files with 65 additions and 14 deletions
--- a/pygadmin/csv_importer.py
+++ b/pygadmin/csv_importer.py
@ -67,12 +67,19 @@ class CSVImporter:

    def assume_data_types(self):
        """
-        Assume the data types of the rows in the csv file based on the given values. Check the first 100 values, so the
-        overhead is small, but the check data is large enough to get a correct assumption. The supported data types for
-        assuming are NULL, INT, DECIMAL and TEXT.
+        Assume the data types of the rows in the csv file based on the given values. Check the first 10000 values, so
+        the overhead is small, but the check data is large enough to get a correct assumption. The supported data types
+        for assuming are NULL, INT, DECIMAL and TEXT.
        """

-        check_limit = len(self.csv_data) - 2
+        # Check the size of the csv data without the header.
+        if len(self.csv_data) - 2 > 10000:
+            # Set the limit to 10000 in case of a larger file.
+            check_limit = 10000
+
+        # Set the limit to the data size of the file.
+        else:
+            check_limit = len(self.csv_data) - 2

        # Create a list for the data types.
        self.data_types = [None] * len(self.csv_data[0])
@ -84,17 +91,19 @@ class CSVImporter:

            # Check the data type of the current column.
            for check_column in range(len(current_row)):
+                # Get the old/previous data type for comparison.
+                old_data_type = self.data_types[check_column]
                # If the data type is TEXT, break, because there is nothing to change. This data type works in every
                # case.
-                if self.data_types[check_column] != "TEXT":
+                if old_data_type != "TEXT":
                    # Get the current value.
                    value = current_row[check_column]
                    # Get the data type of the current value.
                    data_type = self.get_data_type(value)

-                    # If the data type is not null, write the data type in the data type list. # TODO: Debug this data
-                    #   type shit
-                    if data_type != "NULL" or (self.data_types[check_column] != "REAL" and data_type == "INT"):
+                    # If the data type is not null, write the data type in the data type list. Converting REAL to INT is
+                    # not allowed.
+                    if data_type != "NULL" and (not (old_data_type == "REAL" and data_type == "INT")):
                        self.data_types[check_column] = data_type

    def get_data_type(self, value):
@ -197,60 +206,99 @@ class CSVImporter:
            # Check the name of the table.
            self.table_name = self.check_ddl_parameter(self.table_name)

-    def create_insert_queries(self):
-        # TODO: docu
+    def create_and_execute_insert_queries(self):
+        """
+        Create the queries for inserting the data into the table. The data is split into sub lists to improve the
+        performance of the insert. The csv data attribute of the class must not be modified, so a copy is used as
+        work data. Each query is executed right after its creation.
+        """
+
+        # Copy the data list, so the work data list can be used and modified.
        work_data_list = copy.copy(self.csv_data)
+        # Delete the header, because the header does not have to be inserted.
        del work_data_list[0]

+        # Define a chunk size for separating the work data list into chunks. 5000 is an acceptable value between the
+        # two extreme cases (inserting all data in one INSERT or inserting every row/list with its own INSERT).
        chunk_size = 5000

+        # Split the work data list in lists with the given chunk size.
        work_data_list = [work_data_list[i * chunk_size:(i+1) * chunk_size]
                          for i in range((len(work_data_list) + chunk_size - 1) // chunk_size)]

+        # Iterate over every sub list in the work data list. Those sub lists contain their separate chunk of data for
+        # inserting.
        for sub_data_list in work_data_list:
+            # Get the beginning of an insert query.
            insert_query = self.create_insert_query_begin()
+            # Define a list for the parameters, because the data is used as parameter in the query.
            parameter_list = []

+            # Iterate over the row counts, so every row count identifies exactly one row of data.
            for row_count in range(len(sub_data_list)):
+                # Begin the query with ( for the correct SQL syntax.
                value_query = "("
+                # Get the row with the row count.
                row = sub_data_list[row_count]
+                # Iterate over the value count, so it is now possible to get every single value.
                for value_count in range(len(row)):
+                    # Check the comma value: an INSERT needs commas for separating the values, but only, if a new value
+                    # follows, which is the case for every value except the last one.
                    if value_count != len(row)-1:
                        comma_value = ", "

                    else:
                        comma_value = ""

+                    # Put the value query together. "%s" is used as placeholder for the parameter.
                    value_query = "{}%s{}".format(value_query, comma_value)
+                    # Get the actual value.
                    value = row[value_count]

+                    # If the value is equal to the predefined null value/type, the value is set to None. This is
+                    # interpreted as NULL by psycopg2.
                    if value == self.null_type:
                        value = None

+                    # Append the value to the list of parameters.
                    parameter_list.append(value)

+                # If the current row is not the last one in the sub data list, append a comma for separating the
+                # different value lists per insert.
                if row_count != len(sub_data_list)-1:
                    comma_value = ", "

+                # If the row is the last one, use a semicolon for the end of the query.
                else:
                    comma_value = ";"

+                # Put the value query together.
                value_query = "{}){}".format(value_query, comma_value)

+                # Combine the insert query with the value query.
                insert_query = "{}{}".format(insert_query, value_query)

+            # Execute the insert query.
            self.execute_insert_query(insert_query, parameter_list)

    def execute_insert_query(self, insert_query, insert_parameters):
-        # TODO: docu
+        """
+        Get the query and parameters for an insert and execute it with the database query executor.
+        """
+
        self.database_query_executor.database_query = insert_query
        self.database_query_executor.database_query_parameter = insert_parameters
        self.database_query_executor.submit_and_execute_query()

    def create_insert_query_begin(self):
-        # TODO: docu
+        """
+        Create the beginning of an insert query.
+        """
+
+        # Begin with the INSERT INTO and the checked table name, so a formatted input is okay.
        insert_query = "INSERT INTO {} (".format(self.table_name)

+        # Get the header for the column names.
        header = self.csv_data[0]

        for column_count in range(len(header)):
@ -261,11 +309,14 @@ class CSVImporter:
            else:
                comma_value = ""

+            # Insert the column name.
            insert_column = "{}{}".format(header[column_count], comma_value)
            insert_query = "{}{}".format(insert_query, insert_column)

+        # Put the query together and append the VALUES, so the next step is adding the data for inserting.
        insert_query = "{}) VALUES ".format(insert_query)

+        # Return the beginning of the query.
        return insert_query

    @staticmethod
@ -306,5 +357,5 @@ if __name__ == "__main__":
        csv_importer.parse_csv_file()
        csv_importer.assume_data_types()
        csv_importer.get_create_statement()
-        csv_importer.create_insert_queries()
+        csv_importer.create_and_execute_insert_queries()

--- a/pygadmin/widgets/csv_import.py
+++ b/pygadmin/widgets/csv_import.py
@ -60,7 +60,7 @@ class CSVImportDialog(QDialog):

    def insert_data(self):
        begin = time.time()
-        self.csv_importer.create_insert_queries()
+        self.csv_importer.create_and_execute_insert_queries()
        end = time.time()
        print("Runtime: {}".format(end-begin))