Improve data type hierarchy, add documentation
This commit is contained in:
		@@ -67,12 +67,19 @@ class CSVImporter:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    def assume_data_types(self):
 | 
					    def assume_data_types(self):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Assume the data types of the rows in the csv file based on the given values. Check the first 100 values, so the
 | 
					        Assume the data types of the rows in the csv file based on the given values. Check the first 10000 values, so
 | 
				
			||||||
        overhead is small, but the check data is large enough to get a correct assumption. The supported data types for
 | 
					        the overhead is small, but the check data is large enough to get a correct assumption. The supported data types
 | 
				
			||||||
        assuming are NULL, INT, DECIMAL and TEXT.
 | 
					        for assuming are NULL, INT, DECIMAL and TEXT.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        check_limit = len(self.csv_data) - 2
 | 
					        # Check the size of the csv data without the header.
 | 
				
			||||||
 | 
					        if len(self.csv_data) - 2 > 10000:
 | 
				
			||||||
 | 
					            # Set the limit to 10000 in case of a larger file.
 | 
				
			||||||
 | 
					            check_limit = 10000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Set the limit to the data size of the file.
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            check_limit = len(self.csv_data) - 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Create a list for the data types.
 | 
					        # Create a list for the data types.
 | 
				
			||||||
        self.data_types = [None] * len(self.csv_data[0])
 | 
					        self.data_types = [None] * len(self.csv_data[0])
 | 
				
			||||||
@@ -84,17 +91,19 @@ class CSVImporter:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            # Check the data type of the current column.
 | 
					            # Check the data type of the current column.
 | 
				
			||||||
            for check_column in range(len(current_row)):
 | 
					            for check_column in range(len(current_row)):
 | 
				
			||||||
 | 
					                # Get the old/previous data type for comparison.
 | 
				
			||||||
 | 
					                old_data_type = self.data_types[check_column]
 | 
				
			||||||
                # If the data type is TEXT, break, because there is nothing to change. This data type works in every
 | 
					                # If the data type is TEXT, break, because there is nothing to change. This data type works in every
 | 
				
			||||||
                # case.
 | 
					                # case.
 | 
				
			||||||
                if self.data_types[check_column] != "TEXT":
 | 
					                if old_data_type != "TEXT":
 | 
				
			||||||
                    # Get the current value.
 | 
					                    # Get the current value.
 | 
				
			||||||
                    value = current_row[check_column]
 | 
					                    value = current_row[check_column]
 | 
				
			||||||
                    # Get the data type of the current value.
 | 
					                    # Get the data type of the current value.
 | 
				
			||||||
                    data_type = self.get_data_type(value)
 | 
					                    data_type = self.get_data_type(value)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    # If the data type is not null, write the data type in the data type list. # TODO: Debug this data
 | 
					                    # If the data type is not null, write the data type in the data type list. Converting REAL to INT is
 | 
				
			||||||
                    #   type shit
 | 
					                    # not allowed.
 | 
				
			||||||
                    if data_type != "NULL" or (self.data_types[check_column] != "REAL" and data_type == "INT"):
 | 
					                    if data_type != "NULL" and (not (old_data_type == "REAL" and data_type == "INT")):
 | 
				
			||||||
                        self.data_types[check_column] = data_type
 | 
					                        self.data_types[check_column] = data_type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_data_type(self, value):
 | 
					    def get_data_type(self, value):
 | 
				
			||||||
@@ -197,60 +206,99 @@ class CSVImporter:
 | 
				
			|||||||
            # Check the name of the table.
 | 
					            # Check the name of the table.
 | 
				
			||||||
            self.table_name = self.check_ddl_parameter(self.table_name)
 | 
					            self.table_name = self.check_ddl_parameter(self.table_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def create_insert_queries(self):
 | 
					    def create_and_execute_insert_queries(self):
 | 
				
			||||||
        # TODO: docu
 | 
					        """
 | 
				
			||||||
 | 
					        Create the necessary queries for inserting the data in the table based on splitting the data in sub lists for
 | 
				
			||||||
 | 
					        improving the performance of the insert. The attribute of the class for the csv data should not be harmed, so
 | 
				
			||||||
 | 
					        a copy is used for the working process. Execute the queries after their creation.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Copy the data list, so the work data list can be used and modified.
 | 
				
			||||||
        work_data_list = copy.copy(self.csv_data)
 | 
					        work_data_list = copy.copy(self.csv_data)
 | 
				
			||||||
 | 
					        # Delete the header, because the header does not have to be inserted.
 | 
				
			||||||
        del work_data_list[0]
 | 
					        del work_data_list[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Define a chunk size for separating the work data list into chunks. 5000 is an acceptable value between the
 | 
				
			||||||
 | 
					        # two extreme cases (inserting all data in one INSERT or inserting every row/list with an own INSERT).
 | 
				
			||||||
        chunk_size = 5000
 | 
					        chunk_size = 5000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Split the work data list in lists with the given chunk size.
 | 
				
			||||||
        work_data_list = [work_data_list[i * chunk_size:(i+1) * chunk_size]
 | 
					        work_data_list = [work_data_list[i * chunk_size:(i+1) * chunk_size]
 | 
				
			||||||
                          for i in range((len(work_data_list) + chunk_size - 1) // chunk_size)]
 | 
					                          for i in range((len(work_data_list) + chunk_size - 1) // chunk_size)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Iterate over every sub list in the work data list. Those sub lists contain their separate chunk of data for
 | 
				
			||||||
 | 
					        # inserting.
 | 
				
			||||||
        for sub_data_list in work_data_list:
 | 
					        for sub_data_list in work_data_list:
 | 
				
			||||||
 | 
					            # Get the beginning of an insert query.
 | 
				
			||||||
            insert_query = self.create_insert_query_begin()
 | 
					            insert_query = self.create_insert_query_begin()
 | 
				
			||||||
 | 
					            # Define a list for the parameters, because the data is used as parameter in the query.
 | 
				
			||||||
            parameter_list = []
 | 
					            parameter_list = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Get one single row, so this row (count) describes exactly one row of data.
 | 
				
			||||||
            for row_count in range(len(sub_data_list)):
 | 
					            for row_count in range(len(sub_data_list)):
 | 
				
			||||||
 | 
					                # Begin the query with ( for the correct SQL syntax.
 | 
				
			||||||
                value_query = "("
 | 
					                value_query = "("
 | 
				
			||||||
 | 
					                # Get the row with the row count.
 | 
				
			||||||
                row = sub_data_list[row_count]
 | 
					                row = sub_data_list[row_count]
 | 
				
			||||||
 | 
					                # Iterate over the value count, so it is now possible to get every single value.
 | 
				
			||||||
                for value_count in range(len(row)):
 | 
					                for value_count in range(len(row)):
 | 
				
			||||||
 | 
					                    # Check the comma value: an INSERT needs commas for separating the values, but only, if a new value
 | 
				
			||||||
 | 
					                    # follows, which is the case for every value except the last one.
 | 
				
			||||||
                    if value_count != len(row)-1:
 | 
					                    if value_count != len(row)-1:
 | 
				
			||||||
                        comma_value = ", "
 | 
					                        comma_value = ", "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    else:
 | 
					                    else:
 | 
				
			||||||
                        comma_value = ""
 | 
					                        comma_value = ""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                    # Put the value query together. "%s" is used as placeholder for the parameter.
 | 
				
			||||||
                    value_query = "{}%s{}".format(value_query, comma_value)
 | 
					                    value_query = "{}%s{}".format(value_query, comma_value)
 | 
				
			||||||
 | 
					                    # Get the actual value.
 | 
				
			||||||
                    value = row[value_count]
 | 
					                    value = row[value_count]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                    # If the value is equal to the predefined null value/type, the value is set to None. This is
 | 
				
			||||||
 | 
					                    # interpreted as NULL by psycopg2.
 | 
				
			||||||
                    if value == self.null_type:
 | 
					                    if value == self.null_type:
 | 
				
			||||||
                        value = None
 | 
					                        value = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                    # Append the value to the list of parameters.
 | 
				
			||||||
                    parameter_list.append(value)
 | 
					                    parameter_list.append(value)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                # If the current row is not the last one in the sub data list, append a comma for separating the
 | 
				
			||||||
 | 
					                # different value lists per insert.
 | 
				
			||||||
                if row_count != len(sub_data_list)-1:
 | 
					                if row_count != len(sub_data_list)-1:
 | 
				
			||||||
                    comma_value = ", "
 | 
					                    comma_value = ", "
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                # If the row is the last one in the sub data list, use a semicolon for the end of the query.
 | 
				
			||||||
                else:
 | 
					                else:
 | 
				
			||||||
                    comma_value = ";"
 | 
					                    comma_value = ";"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                # Put the value query together.
 | 
				
			||||||
                value_query = "{}){}".format(value_query, comma_value)
 | 
					                value_query = "{}){}".format(value_query, comma_value)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                # Combine the insert query with the value query.
 | 
				
			||||||
                insert_query = "{}{}".format(insert_query, value_query)
 | 
					                insert_query = "{}{}".format(insert_query, value_query)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Execute the insert query.
 | 
				
			||||||
            self.execute_insert_query(insert_query, parameter_list)
 | 
					            self.execute_insert_query(insert_query, parameter_list)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def execute_insert_query(self, insert_query, insert_parameters):
 | 
					    def execute_insert_query(self, insert_query, insert_parameters):
 | 
				
			||||||
        # TODO: docu
 | 
					        """
 | 
				
			||||||
 | 
					        Get the query and parameters for an insert and execute it with the database query executor.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.database_query_executor.database_query = insert_query
 | 
					        self.database_query_executor.database_query = insert_query
 | 
				
			||||||
        self.database_query_executor.database_query_parameter = insert_parameters
 | 
					        self.database_query_executor.database_query_parameter = insert_parameters
 | 
				
			||||||
        self.database_query_executor.submit_and_execute_query()
 | 
					        self.database_query_executor.submit_and_execute_query()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def create_insert_query_begin(self):
 | 
					    def create_insert_query_begin(self):
 | 
				
			||||||
        # TODO: docu
 | 
					        """
 | 
				
			||||||
 | 
					        Create the beginning of an insert query.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Begin with INSERT INTO and the checked table name, so a formatted input is okay.
 | 
				
			||||||
        insert_query = "INSERT INTO {} (".format(self.table_name)
 | 
					        insert_query = "INSERT INTO {} (".format(self.table_name)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Get the header for the column names.
 | 
				
			||||||
        header = self.csv_data[0]
 | 
					        header = self.csv_data[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for column_count in range(len(header)):
 | 
					        for column_count in range(len(header)):
 | 
				
			||||||
@@ -261,11 +309,14 @@ class CSVImporter:
 | 
				
			|||||||
            else:
 | 
					            else:
 | 
				
			||||||
                comma_value = ""
 | 
					                comma_value = ""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Insert the column name.
 | 
				
			||||||
            insert_column = "{}{}".format(header[column_count], comma_value)
 | 
					            insert_column = "{}{}".format(header[column_count], comma_value)
 | 
				
			||||||
            insert_query = "{}{}".format(insert_query, insert_column)
 | 
					            insert_query = "{}{}".format(insert_query, insert_column)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Put the query together and append the VALUES, so the next step is adding the data for inserting.
 | 
				
			||||||
        insert_query = "{}) VALUES ".format(insert_query)
 | 
					        insert_query = "{}) VALUES ".format(insert_query)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Return the begin of the query.
 | 
				
			||||||
        return insert_query
 | 
					        return insert_query
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
@@ -306,5 +357,5 @@ if __name__ == "__main__":
 | 
				
			|||||||
        csv_importer.parse_csv_file()
 | 
					        csv_importer.parse_csv_file()
 | 
				
			||||||
        csv_importer.assume_data_types()
 | 
					        csv_importer.assume_data_types()
 | 
				
			||||||
        csv_importer.get_create_statement()
 | 
					        csv_importer.get_create_statement()
 | 
				
			||||||
        csv_importer.create_insert_queries()
 | 
					        csv_importer.create_and_execute_insert_queries()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -60,7 +60,7 @@ class CSVImportDialog(QDialog):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    def insert_data(self):
 | 
					    def insert_data(self):
 | 
				
			||||||
        begin = time.time()
 | 
					        begin = time.time()
 | 
				
			||||||
        self.csv_importer.create_insert_queries()
 | 
					        self.csv_importer.create_and_execute_insert_queries()
 | 
				
			||||||
        end = time.time()
 | 
					        end = time.time()
 | 
				
			||||||
        print("Runtime: {}".format(end-begin))
 | 
					        print("Runtime: {}".format(end-begin))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user