Skip to content

QATCH Generate

TestGenerator

The interface to connect the MultipleDatabase with the SQL generators. Use this class to generate queries and questions from the databases.

Attributes:

Name Type Description
databases MultipleDatabases

The MultipleDatabases object representing the database connections.

Source code in qatch/test_generator.py
class TestGenerator:
    """
    The interface to connect the MultipleDatabase with the SQL generators.
    Use this class to generate queries and questions from the databases.

    Attributes:
        databases (MultipleDatabases): The MultipleDatabases object representing the database connections.
    """

    def __init__(self, databases: MultipleDatabases):
        self.databases = databases

        self._generators = {'select': SelectGenerator,
                            'orderby': OrderByGenerator,
                            'distinct': DistinctGenerator,
                            'where': WhereGenerator,
                            'groupby': GroupByGenerator,
                            'having': HavingGenerator,
                            'simpleAgg': SimpleAggGenerator,
                            'nullCount': NullGenerator,
                            'join': JoinGenerator}

    def generate(self,
                 generators: list[str] | str | None = None,
                 db_names: str | list[str] | None = None,
                 seed=2023
                 ) -> pd.DataFrame:
        """
        Generate test queries and questions for specified generators and databases.

        Args:
            generators (list[str] | str | None): Optional. A list of generator names to be used.
                                                  Default is to use all available generators
                                                  ['select', 'orderby', 'distinct', 'where', 'groupby',
                                                   'having', 'simpleAgg', 'nullCount'].
            db_names (str | list[str] | None): Optional. The name or list of names of databases to generate tests for.
                                                Default is to use all available databases.
            seed (int): Optional. Seed value for randomization. Default is 2023.

        Returns:
            pd.DataFrame: A DataFrame containing generated test queries, questions, and related information.

        Examples:
            Given a MultipleDatabases object "database", with three databases 'sakila', 'world', and 'employees'
            >>> generator = TestGenerator(databases)
            >>> tests_df = generator.generate(generators=['select', 'orderby'], db_names=['sakila', 'world'])
            generate tests only for select and orderby generators, and only for sakila and world databases
        """
        # TODO possible change of the db_name, add dictionary to specify also the tbl_names
        generators, db_names, = self._init_params(generators, db_names)

        tests_df_list = []
        for db_name in tqdm(db_names, desc='Generating test for each database'):
            # for each db_name
            for generator in generators:
                # init generator
                db = self.databases[db_name]
                generator = self._generators[generator](db, seed)
                for tbl in db.table_names:
                    sql_generated = generator.sql_generate(tbl)
                    df = self._build_df(db_name, tbl, sql_generated)
                    tests_df_list.append(df)

        tests_df = pd.concat(tests_df_list, ignore_index=True)
        return tests_df

    def _init_params(self, generators: list[str] | str | None,
                     db_names: str | list[str] | None) -> tuple[list[str], list[str]]:
        """
        Validate and initialize generator names and database names.

        Args:
            generators (list[str] | str | None): The list of generator names or a single generator name.
            db_names (str | list[str] | None): The name or list of names of databases to generate tests for.

        Returns:
            tuple[list[str], list[str]]: Validated generator names and database names.
        """
        # generators check
        if generators is None:
            generators = list(self._generators.keys())
        else:
            if isinstance(generators, str):
                generators = [generators]
            for generator in generators:
                if generator not in self._generators.keys():
                    raise KeyError(f'Generators must be one of {list(self._generators.keys())}')

        # db_names check
        available_dbs = self.databases.get_db_names()
        if db_names is None:
            db_names = available_dbs
        else:
            if isinstance(db_names, str):
                db_names = [db_names]
            for db_name in db_names:
                if db_name not in available_dbs:
                    raise KeyError(f'Database name "{db_name}" must be one of {available_dbs}')
        return generators, db_names

    @staticmethod
    def _build_df(db_name: str, tbl_name: str, sql_generated: dict[str, list]) -> pd.DataFrame:
        """
        Build a DataFrame from generated SQL queries, questions, and related information.

        Args:
            db_name (str): The name of the database.
            tbl_name (str): The name of the table in the database.
            sql_generated (dict[str, list]): A dictionary containing generated SQL tags, queries, and questions.

        Returns:
            pd.DataFrame: A DataFrame containing generated test queries, questions, and related information.
        """

        sql_tags = sql_generated['sql_tags']
        queries = sql_generated['queries']
        questions = sql_generated['questions']
        return pd.DataFrame({
            'db_id': [db_name] * len(sql_tags),
            'tbl_name': [tbl_name] * len(sql_tags),
            'sql_tags': sql_tags,
            'query': queries,
            'question': questions
        })

generate(generators=None, db_names=None, seed=2023)

Generate test queries and questions for specified generators and databases.

Parameters:

Name Type Description Default
generators list[str] | str | None

Optional. A list of generator names to be used. Default is to use all available generators ['select', 'orderby', 'distinct', 'where', 'groupby', 'having', 'simpleAgg', 'nullCount'].

None
db_names str | list[str] | None

Optional. The name or list of names of databases to generate tests for. Default is to use all available databases.

None
seed int

Optional. Seed value for randomization. Default is 2023.

2023

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing generated test queries, questions, and related information.

Examples:

Given a MultipleDatabases object "database", with three databases 'sakila', 'world', and 'employees'

>>> generator = TestGenerator(databases)
>>> tests_df = generator.generate(generators=['select', 'orderby'], db_names=['sakila', 'world'])
generate tests only for select and orderby generators, and only for sakila and world databases
Source code in qatch/test_generator.py
def generate(self,
             generators: list[str] | str | None = None,
             db_names: str | list[str] | None = None,
             seed=2023
             ) -> pd.DataFrame:
    """
    Generate test queries and questions for specified generators and databases.

    Args:
        generators (list[str] | str | None): Optional. A list of generator names to be used.
                                              Default is to use all available generators
                                              ['select', 'orderby', 'distinct', 'where', 'groupby',
                                               'having', 'simpleAgg', 'nullCount'].
        db_names (str | list[str] | None): Optional. The name or list of names of databases to generate tests for.
                                            Default is to use all available databases.
        seed (int): Optional. Seed value for randomization. Default is 2023.

    Returns:
        pd.DataFrame: A DataFrame containing generated test queries, questions, and related information.

    Examples:
        Given a MultipleDatabases object "database", with three databases 'sakila', 'world', and 'employees'
        >>> generator = TestGenerator(databases)
        >>> tests_df = generator.generate(generators=['select', 'orderby'], db_names=['sakila', 'world'])
        generate tests only for select and orderby generators, and only for sakila and world databases
    """
    # TODO possible change of the db_name, add dictionary to specify also the tbl_names
    generators, db_names, = self._init_params(generators, db_names)

    tests_df_list = []
    for db_name in tqdm(db_names, desc='Generating test for each database'):
        # for each db_name
        for generator in generators:
            # init generator
            db = self.databases[db_name]
            generator = self._generators[generator](db, seed)
            for tbl in db.table_names:
                sql_generated = generator.sql_generate(tbl)
                df = self._build_df(db_name, tbl, sql_generated)
                tests_df_list.append(df)

    tests_df = pd.concat(tests_df_list, ignore_index=True)
    return tests_df

DistinctGenerator

Bases: AbstractSqlGenerator

A class for generating DISTINCT SQL queries and corresponding questions based on categorical columns of a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/distinct_generator.py
class DistinctGenerator(AbstractSqlGenerator):
    """
    A class for generating DISTINCT SQL queries and corresponding questions based on
    categorical columns of a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generates DISTINCT SQL queries and corresponding questions based on categorical columns of a table.
        Generates two distinct tags: DISTINCT-SINGLE and DISTINCT-MULT only for the categorical columns.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "names"
            >>> generator = DistinctGenerator(database)
            >>> generator._distinct_single_col("table_name", ["colors"])
            >>> generator.sql_generated
            {"sql_tags": ["DISTINCT-SINGLE"],
            "queries": ["SELECT DISTINCT \"colors\" FROM \"table_name\""],
            "questions": ["Show the different \"colors\" in the table table_name"]}
            >>> generator_distinct_mult_col("table_name", ["colors", "names"])
            >>> generator.sql_generated
            {"sql_tags": ["DISTINCT-MULT"],
            "queries": ["SELECT DISTINCT \"colors\", \"names\" FROM \"table_name\""],
            "questions": ["Show the different \"colors\", \"names\" in the table table_name"]}
        """
        self.empty_sql_generated()
        _, cat_cols, _ = self._sample_cat_num_cols(table_name)
        self._distinct_single_col(table_name, cat_cols)
        self._distinct_mult_col(table_name, cat_cols)
        return self.sql_generated

    def _distinct_single_col(self, table_name: str, cat_columns: list):
        """
        Generates DISTINCT SQL queries and questions for individual categorical columns.

        Args:
            table_name (str): The name of the table in the database.
            cat_columns (List[str]): List of categorical column names.
        """
        queries = [f'SELECT DISTINCT `{col}` FROM `{table_name}`'
                   for col in cat_columns]

        questions = [f'Show the different "{col}" in the table "{table_name}"'
                     for col in cat_columns]

        sql_tags = ['DISTINCT-SINGLE'] * len(queries)
        self.append_sql_generated(sql_tags=sql_tags, queries=queries, questions=questions)

    def _distinct_mult_col(self, table_name: str, cat_columns: list):
        """
        Generates DISTINCT SQL queries and questions for combinations of multiple categorical columns.

        Args:
            table_name (str): The name of the table in the database.
            cat_columns (List[str]): List of categorical column names.
        """
        combinations = self._comb_random(cat_columns)
        queries = [f'SELECT DISTINCT {self._get_col_comb_str(comb)} FROM `{table_name}`'
                   for comb in combinations]

        questions = [f'Show the different {self._get_col_comb_str(comb)} in the table "{table_name}"'
                     for comb in combinations]

        sql_tags = ['DISTINCT-MULT'] * len(queries)
        self.append_sql_generated(sql_tags=sql_tags, queries=queries, questions=questions)

sql_generate(table_name)

Generates DISTINCT SQL queries and corresponding questions based on categorical columns of a table. Generates two distinct tags: DISTINCT-SINGLE and DISTINCT-MULT only for the categorical columns.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "names"

>>> generator = DistinctGenerator(database)
>>> generator._distinct_single_col("table_name", ["colors"])
>>> generator.sql_generated
{"sql_tags": ["DISTINCT-SINGLE"],
"queries": ["SELECT DISTINCT "colors" FROM "table_name""],
"questions": ["Show the different "colors" in the table table_name"]}
>>> generator_distinct_mult_col("table_name", ["colors", "names"])
>>> generator.sql_generated
{"sql_tags": ["DISTINCT-MULT"],
"queries": ["SELECT DISTINCT "colors", "names" FROM "table_name""],
"questions": ["Show the different "colors", "names" in the table table_name"]}
Source code in qatch/sql_generator/distinct_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generates DISTINCT SQL queries and corresponding questions based on categorical columns of a table.
    Generates two distinct tags: DISTINCT-SINGLE and DISTINCT-MULT only for the categorical columns.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "names"
        >>> generator = DistinctGenerator(database)
        >>> generator._distinct_single_col("table_name", ["colors"])
        >>> generator.sql_generated
        {"sql_tags": ["DISTINCT-SINGLE"],
        "queries": ["SELECT DISTINCT \"colors\" FROM \"table_name\""],
        "questions": ["Show the different \"colors\" in the table table_name"]}
        >>> generator_distinct_mult_col("table_name", ["colors", "names"])
        >>> generator.sql_generated
        {"sql_tags": ["DISTINCT-MULT"],
        "queries": ["SELECT DISTINCT \"colors\", \"names\" FROM \"table_name\""],
        "questions": ["Show the different \"colors\", \"names\" in the table table_name"]}
    """
    self.empty_sql_generated()
    _, cat_cols, _ = self._sample_cat_num_cols(table_name)
    self._distinct_single_col(table_name, cat_cols)
    self._distinct_mult_col(table_name, cat_cols)
    return self.sql_generated

GroupByGenerator

Bases: AbstractSqlGenerator

A class for generating SQL queries and corresponding questions based on group-by operations performed on categorical and numerical columns of a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/groupby_generator.py
class GroupByGenerator(AbstractSqlGenerator):
    """
    A class for generating SQL queries and corresponding questions based on group-by operations
    performed on categorical and numerical columns of a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generates Group By queries and corresponding questions for both categorical and numerical columns.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
            >>> generator = GroupByGenerator(database)
            >>> generator._build_group_by_no_agg("table_name", ["colors"])
            >>> generator.sql_generated
            {
                "sql_tags": ["GROUPBY-NO-AGGR"],
                "queries": ["SELECT `colors` FROM `table_name` GROUP BY `colors`"],
                "questions": ["Show all \"colors\" in the table "table_name" for each \"colors\""]
            }
            >>> generator._build_group_by_with_count("table_name", ["colors"])
            >>> generator.sql_generated
            {
                "sql_tags": ["GROUPBY-COUNT"],
                "queries": ["SELECT `colors`, COUNT(*) FROM `table_name` GROUP BY `colors`"],
                "questions": ["For each \"colors\", count the number of rows in table "table_name""]
            }
            >>> generator._build_group_by_with_agg("table_name")
            >>> generator.sql_generated
            {
                "sql_tags": ["GROUPBY-AGG-MIN", "GROUPBY-AGG-MAX", "GROUPBY-AGG-AVG", "GROUPBY-AGG-SUM"],
                "queries": [
                    "SELECT `colors`, MIN(`numbers`) FROM `table_name` GROUP BY `colors`",
                    "SELECT `colors`, MAX(`numbers`) FROM `table_name` GROUP BY `colors`",
                    "SELECT `colors`, AVG(`numbers`) FROM `table_name` GROUP BY `colors`",
                    "SELECT `colors`, SUM(`numbers`) FROM `table_name` GROUP BY `colors`"
                ],
                "questions": [
                    "For each `colors`, find the min of `numbers` in table `table_name`",
                    "For each `colors`, find the max of `numbers` in table `table_name`",
                    "For each `colors`, find the avg of `numbers` in table `table_name`",
                    "For each `colors`, find the sum of `numbers` in table `table_name`"
                ]
            }
        """
        self.empty_sql_generated()
        df, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
        self._build_group_by_no_agg(table_name, cat_cols)
        self._build_group_by_with_count(table_name, cat_cols)
        self._build_group_by_with_agg(table_name)
        return self.sql_generated

    def _build_group_by_no_agg(self, table_name: str, cat_cols: list):
        """
        Generate group-by SQL queries and questions without aggregation
        for random combinations of categorical columns.
        The query result is the same as Distinct.

        Args:
            table_name (str): The name of the table in the database.
            cat_cols (List[str]): List of categorical columns.
        """
        random_combinations = self._comb_random(cat_cols)

        questions = [f'Show all {self._get_col_comb_str(comb)}' \
                     f' in the table "{table_name}" for each {self._get_col_comb_str(comb)}'
                     for comb in random_combinations]

        queries = [f'SELECT {self._get_col_comb_str(comb)} FROM ' \
                   f'`{table_name}` GROUP BY {self._get_col_comb_str(comb)}'
                   for comb in random_combinations]

        sql_tags = ['GROUPBY-NO-AGGR'] * len(queries)

        self.append_sql_generated(sql_tags=sql_tags, queries=queries,
                                  questions=questions)

    def _build_group_by_with_count(self, table_name: str, cat_cols: list):
        """
        Generate group-by SQL queries and questions with count aggregation for categorical columns.

        Args:
            table_name (str): The name of the table in the database.
            cat_cols (List[str]): List of categorical columns.
        """
        questions = [f'For each "{col}", count the number of rows in table "{table_name}"'
                     for col in cat_cols]
        queries = [f'SELECT `{col}`, COUNT(*) FROM `{table_name}` GROUP BY `{col}`'
                   for col in cat_cols]
        sql_tags = ['GROUPBY-COUNT'] * len(queries)

        self.append_sql_generated(sql_tags=sql_tags, queries=queries,
                                  questions=questions)

    def _build_group_by_with_agg(self, table_name: str):
        """
        Generate group-by SQL queries and questions with aggregation for numerical columns.

        Args:
            table_name (str): The name of the table in the database.
        """
        # with sample == 2 we get 4 tests for each aggregation -> 4*4 = 16 tests
        # with sample == 3 we get 9 tests for each aggregation -> 9*4 = 36 tests
        _, cat_cols, num_cols = self._sample_cat_num_cols(table_name, sample=2)
        for agg in ['min', 'max', 'avg', 'sum']:
            questions = [f'For each "{c_col}", find the {agg} of "{n_col}" in table "{table_name}"'
                         for c_col in cat_cols
                         for n_col in num_cols]

            queries = [f'SELECT `{c_col}`, {agg.upper()}(`{n_col}`) FROM `{table_name}` GROUP BY `{c_col}`'
                       for c_col in cat_cols
                       for n_col in num_cols]

            sql_tags = [f'GROUPBY-AGG-{agg.upper()}'] * len(queries)

            self.append_sql_generated(sql_tags=sql_tags, queries=queries,
                                      questions=questions)

sql_generate(table_name)

Generates Group By queries and corresponding questions for both categorical and numerical columns.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"

>>> generator = GroupByGenerator(database)
>>> generator._build_group_by_no_agg("table_name", ["colors"])
>>> generator.sql_generated
{
    "sql_tags": ["GROUPBY-NO-AGGR"],
    "queries": ["SELECT `colors` FROM `table_name` GROUP BY `colors`"],
    "questions": ["Show all "colors" in the table "table_name" for each "colors""]
}
>>> generator._build_group_by_with_count("table_name", ["colors"])
>>> generator.sql_generated
{
    "sql_tags": ["GROUPBY-COUNT"],
    "queries": ["SELECT `colors`, COUNT(*) FROM `table_name` GROUP BY `colors`"],
    "questions": ["For each "colors", count the number of rows in table "table_name""]
}
>>> generator._build_group_by_with_agg("table_name")
>>> generator.sql_generated
{
    "sql_tags": ["GROUPBY-AGG-MIN", "GROUPBY-AGG-MAX", "GROUPBY-AGG-AVG", "GROUPBY-AGG-SUM"],
    "queries": [
        "SELECT `colors`, MIN(`numbers`) FROM `table_name` GROUP BY `colors`",
        "SELECT `colors`, MAX(`numbers`) FROM `table_name` GROUP BY `colors`",
        "SELECT `colors`, AVG(`numbers`) FROM `table_name` GROUP BY `colors`",
        "SELECT `colors`, SUM(`numbers`) FROM `table_name` GROUP BY `colors`"
    ],
    "questions": [
        "For each `colors`, find the min of `numbers` in table `table_name`",
        "For each `colors`, find the max of `numbers` in table `table_name`",
        "For each `colors`, find the avg of `numbers` in table `table_name`",
        "For each `colors`, find the sum of `numbers` in table `table_name`"
    ]
}
Source code in qatch/sql_generator/groupby_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generates Group By queries and corresponding questions for both categorical and numerical columns.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
        >>> generator = GroupByGenerator(database)
        >>> generator._build_group_by_no_agg("table_name", ["colors"])
        >>> generator.sql_generated
        {
            "sql_tags": ["GROUPBY-NO-AGGR"],
            "queries": ["SELECT `colors` FROM `table_name` GROUP BY `colors`"],
            "questions": ["Show all \"colors\" in the table "table_name" for each \"colors\""]
        }
        >>> generator._build_group_by_with_count("table_name", ["colors"])
        >>> generator.sql_generated
        {
            "sql_tags": ["GROUPBY-COUNT"],
            "queries": ["SELECT `colors`, COUNT(*) FROM `table_name` GROUP BY `colors`"],
            "questions": ["For each \"colors\", count the number of rows in table "table_name""]
        }
        >>> generator._build_group_by_with_agg("table_name")
        >>> generator.sql_generated
        {
            "sql_tags": ["GROUPBY-AGG-MIN", "GROUPBY-AGG-MAX", "GROUPBY-AGG-AVG", "GROUPBY-AGG-SUM"],
            "queries": [
                "SELECT `colors`, MIN(`numbers`) FROM `table_name` GROUP BY `colors`",
                "SELECT `colors`, MAX(`numbers`) FROM `table_name` GROUP BY `colors`",
                "SELECT `colors`, AVG(`numbers`) FROM `table_name` GROUP BY `colors`",
                "SELECT `colors`, SUM(`numbers`) FROM `table_name` GROUP BY `colors`"
            ],
            "questions": [
                "For each `colors`, find the min of `numbers` in table `table_name`",
                "For each `colors`, find the max of `numbers` in table `table_name`",
                "For each `colors`, find the avg of `numbers` in table `table_name`",
                "For each `colors`, find the sum of `numbers` in table `table_name`"
            ]
        }
    """
    self.empty_sql_generated()
    df, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
    self._build_group_by_no_agg(table_name, cat_cols)
    self._build_group_by_with_count(table_name, cat_cols)
    self._build_group_by_with_agg(table_name)
    return self.sql_generated

HavingGenerator

Bases: AbstractSqlGenerator

A class for generating HAVING SQL queries and corresponding questions based on a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/having_generator.py
class HavingGenerator(AbstractSqlGenerator):
    """
    A class for generating HAVING SQL queries and corresponding questions based on a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generate HAVING SQL queries and corresponding questions for categorical and numerical columns.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
            and (i) the average number of rows for each category in "colors" is 2
            (ii) the mean of the average numbers for each category is 5
            (iii) the average sum of numbers for each category is 10:
            >>> generator = HavingGenerator(database)
            >>> generator._build_having_count(table_name, ["colors"], df)
            >>> generator.sql_generated
            {
                "sql_tags": ["HAVING-COUNT-GR", "HAVING-COUNT-LS", "HAVING-COUNT-EQ"],
                "queries": [
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) >= 2',
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) <= 2',
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) = 2'
                ],
                "questions": [
                    'Find all the `colors` that have at least 2 records in table `table_name`',
                    'Find all the `colors` that have at most 2 records in table `table_name`',
                    'Find all the `colors` that have exactly 2 records in table `table_name`'
                ]
            }
            >>> generator._build_having_agg(table_name, ["colors"], ["numbers"], df)
            >>> generator.sql_generated
            {
                "sql_tags": ["HAVING-AGG-AVG-GR", "HAVING-AGG-AVG-LS", "HAVING-AGG-SUM-GR", "HAVING-AGG-SUM-LS"],
                "queries": [
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING AVG(`numbers`) >= 5.0',
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING AVG(`numbers`) <= 5.0',
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING SUM(`numbers`) >= 10.0',
                    'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING SUM(`numbers`) <= 10.0'
                ],
                "questions": [
                    'List the `colors` which average `numbers` is at least 5.0 in table `table_name`',
                    'List the `colors` which average `numbers` is at most 5.0 in table `table_name`',
                    'List the `colors` which summation of `numbers` is at least 5.0 in table `table_name`',
                    'List the `colors` which summation of `numbers` is at most 5.0 in table `table_name`'
                ]
            }
        """
        self.empty_sql_generated()
        df, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
        self._build_having_count(table_name, cat_cols, df)
        self._build_having_agg(table_name, cat_cols, num_cols, df)
        return self.sql_generated

    def _build_having_count(self, table_name: str, cat_cols: list, df: pd.DataFrame):
        """
        Build HAVING SQL queries and questions for categorical columns based on row counts.

        Args:
            table_name (str): The name of the table in the database.
            cat_cols (list): List of categorical columns.
            df (pd.DataFrame): The DataFrame containing the data.
        """

        for cat_col in cat_cols:
            # get a mean count of the category cat_col
            mean_count = self._get_average_of_count_cat_col(table_name, cat_col)
            # int(df.groupby(cat_col).count().mean().values[0])
            queries = [
                f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING count(*) >= {mean_count}""",
                f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING count(*) <= {mean_count}""",
                f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING count(*) = {mean_count}"""
            ]

            questions = [
                f'Find all the "{cat_col}" that have at least {mean_count} records in table "{table_name}"',
                f'Find all the "{cat_col}" that have at most {mean_count} records in table "{table_name}"',
                f'Find all the "{cat_col}" that have exactly {mean_count} records in table "{table_name}"'
            ]

            sql_tags = ['HAVING-COUNT-GR', 'HAVING-COUNT-LS', 'HAVING-COUNT-EQ']

            self.append_sql_generated(sql_tags, queries, questions)

    def _build_having_agg(self, table_name: str, cat_cols: list, num_cols: list, df: pd.DataFrame):
        """
        Build HAVING SQL queries and questions for numerical columns based on aggregations.

        Args:
            table_name (str): The name of the table in the database.
            cat_cols (list): List of categorical columns.
            num_cols (list): List of numerical columns.
            df (pd.DataFrame): The DataFrame containing the data.
        """
        # with sample == 2 we get 4 tests for each aggregation -> 4*4 = 16 tests
        # with sample == 3 we get 9 tests for each aggregation -> 9*4 = 36 tests
        for cat_col in cat_cols:
            # the mean for each grouped category
            # mean_sum = df.groupby(cat_col).sum(numeric_only=True)
            # mean_mean = df.groupby(cat_col).mean(numeric_only=True)
            for num_col in num_cols:
                # the mean of sum for the grouped category
                # mean_mean_sum = round(mean_sum[num_col].mean(), 2)
                # mean_mean_mean = round(mean_mean[num_col].mean(), 2)
                mean_mean_sum, mean_mean_mean = self._get_average_of_sum_avg_cat_col(table_name, cat_col, num_col)
                queries = [
                    f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING AVG(`{num_col}`) >= {mean_mean_mean}""",
                    f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING AVG(`{num_col}`) <= {mean_mean_mean}""",
                    f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING SUM(`{num_col}`) >= {mean_mean_sum}""",
                    f"""SELECT `{cat_col}` FROM `{table_name}` GROUP BY `{cat_col}` HAVING SUM(`{num_col}`) <= {mean_mean_sum}""",
                ]

                questions = [
                    f'List the "{cat_col}" which average "{num_col}" is at least {mean_mean_mean} in table "{table_name}"',
                    f'List the "{cat_col}" which average "{num_col}" is at most {mean_mean_mean} in table "{table_name}"',

                    f'List the "{cat_col}" which summation of "{num_col}" is at least {mean_mean_sum} in table "{table_name}"',
                    f'List the "{cat_col}" which summation of "{num_col}" is at most {mean_mean_sum} in table "{table_name}"',
                ]

                sql_tags = ['HAVING-AGG-AVG-GR', 'HAVING-AGG-AVG-LS',
                            'HAVING-AGG-SUM-GR', 'HAVING-AGG-SUM-LS']

                self.append_sql_generated(sql_tags, queries, questions)

    def _get_average_of_count_cat_col(self, table_name, cat_col):
        """
        Helper method to calculate the average count of rows for each category in a categorical column.

        Args:
            table_name (str): The name of the table in the database.
            cat_col (str): The name of the categorical column.

        Returns:
            int: The average count of rows for each category.
        """
        # TODO: pandas performs faster when number of tuples is 5e4 or more
        # SQL query to get the average count for each category
        inner_query = f'SELECT COUNT(*) AS row_count FROM `{table_name}` GROUP BY `{cat_col}`'
        # Run the inner query and get the average of row counts
        average = self.database.run_query(f'SELECT AVG(row_count) FROM ({inner_query})')[0][0]
        return int(average)

    def _get_average_of_sum_avg_cat_col(self, table_name, cat_col, num_col):
        """
        Helper method to calculate the average sum and average of a numerical column for each category in a categorical column.

        Args:
            table_name (str): The name of the table in the database.
            cat_col (str): The name of the categorical column.
            num_col (str): The name of the numerical column.

        Returns:
            tuple: A tuple containing the average sum and average of the numerical column for each category.
        """
        # TODO: pandas performs faster when number of tuples is 5e4 or more
        # SQL queries to get the average sum and average of numerical column for each category
        inner_query_sum = f'SELECT SUM(`{num_col}`) AS sum_col FROM `{table_name}` GROUP BY `{cat_col}`'
        inner_query_avg = f'SELECT AVG(`{num_col}`) AS avg_col FROM `{table_name}` GROUP BY `{cat_col}`'
        # Run the inner queries and get the average of sums and averages
        average_sum = self.database.run_query(f'SELECT AVG(sum_col) FROM ({inner_query_sum})')[0][0]
        average_avg = self.database.run_query(f'SELECT AVG(avg_col) FROM ({inner_query_avg})')[0][0]
        return round(average_sum, 2), round(average_avg, 2)

sql_generate(table_name)

Generate HAVING SQL queries and corresponding questions for categorical and numerical columns.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers" and (i) the average number of rows for each category in "colors" is 2 (ii) the mean of the average numbers for each category is 5 (iii) the average sum of numbers for each category is 10:

>>> generator = HavingGenerator(database)
>>> generator._build_having_count(table_name, ["colors"], df)
>>> generator.sql_generated
{
    "sql_tags": ["HAVING-COUNT-GR", "HAVING-COUNT-LS", "HAVING-COUNT-EQ"],
    "queries": [
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) >= 2',
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) <= 2',
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) = 2'
    ],
    "questions": [
        'Find all the `colors` that have at least 2 records in table `table_name`',
        'Find all the `colors` that have at most 2 records in table `table_name`',
        'Find all the `colors` that have exactly 2 records in table `table_name`'
    ]
}
>>> generator._build_having_agg(table_name, ["colors"], ["numbers"], df)
>>> generator.sql_generated
{
    "sql_tags": ["HAVING-AGG-AVG-GR", "HAVING-AGG-AVG-LS", "HAVING-AGG-SUM-GR", "HAVING-AGG-SUM-LS"],
    "queries": [
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING AVG(`numbers`) >= 5.0',
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING AVG(`numbers`) <= 5.0',
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING SUM(`numbers`) >= 10.0',
        'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING SUM(`numbers`) <= 10.0'
    ],
    "questions": [
        'List the `colors` which average `numbers` is at least 5.0 in table `table_name`',
        'List the `colors` which average `numbers` is at most 5.0 in table `table_name`',
        'List the `colors` which summation of `numbers` is at least 5.0 in table `table_name`',
        'List the `colors` which summation of `numbers` is at most 5.0 in table `table_name`'
    ]
}
Source code in qatch/sql_generator/having_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generate HAVING SQL queries and corresponding questions for categorical and numerical columns.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
        and (i) the average number of rows for each category in "colors" is 2
        (ii) the mean of the average numbers for each category is 5
        (iii) the average sum of numbers for each category is 10:
        >>> generator = HavingGenerator(database)
        >>> generator._build_having_count(table_name, ["colors"], df)
        >>> generator.sql_generated
        {
            "sql_tags": ["HAVING-COUNT-GR", "HAVING-COUNT-LS", "HAVING-COUNT-EQ"],
            "queries": [
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) >= 2',
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) <= 2',
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING count(*) = 2'
            ],
            "questions": [
                'Find all the `colors` that have at least 2 records in table `table_name`',
                'Find all the `colors` that have at most 2 records in table `table_name`',
                'Find all the `colors` that have exactly 2 records in table `table_name`'
            ]
        }
        >>> generator._build_having_agg(table_name, ["colors"], ["numbers"], df)
        >>> generator.sql_generated
        {
            "sql_tags": ["HAVING-AGG-AVG-GR", "HAVING-AGG-AVG-LS", "HAVING-AGG-SUM-GR", "HAVING-AGG-SUM-LS"],
            "queries": [
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING AVG(`numbers`) >= 5.0',
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING AVG(`numbers`) <= 5.0',
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING SUM(`numbers`) >= 10.0',
                'SELECT `colors` FROM `table_name` GROUP BY `colors` HAVING SUM(`numbers`) <= 10.0'
            ],
            "questions": [
                'List the `colors` which average `numbers` is at least 5.0 in table `table_name`',
                'List the `colors` which average `numbers` is at most 5.0 in table `table_name`',
                'List the `colors` which summation of `numbers` is at least 5.0 in table `table_name`',
                'List the `colors` which summation of `numbers` is at most 5.0 in table `table_name`'
            ]
        }
    """
    self.empty_sql_generated()
    df, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
    self._build_having_count(table_name, cat_cols, df)
    self._build_having_agg(table_name, cat_cols, num_cols, df)
    return self.sql_generated

JoinGenerator

Bases: AbstractSqlGenerator

A class for generating SQL queries, and questions based on input tables in a database.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/join_generator.py
class JoinGenerator(AbstractSqlGenerator):
    """
    A class for generating SQL queries, and questions based on input tables in a database.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        This method generates SQL queries by performing JOIN operations on the
        specified table with other tables that have common columns in the database.
        It first empties any previously generated SQL queries, then it determines the
        tables to join based on common columns, and finally it generates the
        actual SQL queries.

        Args:
            table_name (str): The name of the input table.

        Returns:
            dict[str, list]: A dictionary where the keys are SQL tags and the values are lists of generated SQL queries.

        Examples:
             Assuming a MultipleDatabases object "database" with two tables, 'orders' and 'customers', 'orders' table has columns
            ['order_id', 'product', 'customer_id'] and 'customers' table has columns ['customer_id', 'name', 'address'].
            >>> generator = JoinGenerator(database)
            >>> generator._generate_join_project_all("orders")
            >>> generator.sql_generated
            >>> {'sql_tags': ['JOIN-PROJECT-ALL'],
            >>>    'queries': ['SELECT * FROM "orders" AS T1 JOIN customers AS T2 ON T1.customer_id=T2.customer_id'],
            >>>    'questions': ['Join all the records from table "orders" with table "customers" on "customer_id"']}

            >>> generator._generate_join_cat_columns("orders")
            >>> generator.sql_generated
            >>> {'sql_tags': ['JOIN-PROJECT-CAT'],
            >>>    'queries': ["SELECT T1.product, T2.name FROM orders AS T1 JOIN customers AS T2 ON T1.order_id=T2.order_id"],
            >>>    'questions': ['List all the "product" and "name" from the table "orders" and the table "customers" where "order_id" is the same']}

        Note:
            The method makes use of other internal methods to firstly
            get tables that can be joined with the given table and
            generate SQL queries on this basis.
        """
        self.empty_sql_generated()
        # get columns to perform the join operator that contains
        table_to_join2cols = self._get_table_name_to_join(table_name)
        self._generate_join_cat_columns(table_name, table_to_join2cols)
        self._generate_join_project_all(table_name, table_to_join2cols)
        return self.sql_generated

    def _generate_join_project_all(self, table_name: str, table_to_join2cols: dict):
        """
        A helper method to generate SQL queries, questions and SQL tags that join all records from two tables.

        This method constructs the join queries based on the given table name and a dictionary mapping tables to join columns.
        After constructing the queries, questions and SQL tags, it appends them to the sql_generated attribute using the
        append_sql_generated method.

        Args:
            table_name (str): The name of the table to be joined.
            table_to_join2cols (dict): A dictionary where the key is the name of the table to be joined
                                        and the value is a list of column names in the joining table.

        Example:
            Assuming we have two tables, 'orders' and 'customers', 'orders' table has columns
            ['order_id', 'product', 'customer_id'] and 'customers' table has columns ['customer_id', 'name', 'address'].

            >>> table_to_join2cols = {'customers': ['customer_id']}
            >>> _generate_join_project_all('orders', table_to_join2cols)

            After calling the method, the 'sql_generated' attribute of the class instance will contain the following:

            >>> {'sql_tags': ['JOIN-PROJECT-ALL'],
            >>>    'queries': ['SELECT * FROM "orders" AS T1 JOIN customers AS T2 ON T1.customer_id=T2.customer_id'],
            >>>    'questions': ['Join all the records from table "orders" with table "customers" on "customer_id"']}
        """
        queries, questions, sql_tags = [], [], []
        for t2, join_col in table_to_join2cols.items():
            for col in join_col:
                # create the join query
                queries.append(f'SELECT * FROM `{table_name}` AS T1 JOIN {t2} AS T2 ON T1.`{col}`=T2.`{col}`')
                questions.append(f'Join all the records from table "{table_name}" with table "{t2}" on "{col}"')
                sql_tags.append('JOIN-PROJECT-ALL')
        self.append_sql_generated(sql_tags, queries, questions)

    def _generate_join_cat_columns(self, table_name: str, table_to_join2cols: dict):
        """
        Helper method to generate SQL queries that joins categorical columns from two tables on a common column.
        Also generates corresponding questions and SQL tags.

        Args:
            table_name (str): The name of the base table for generating SQL queries.
            table_to_join2cols (dict): A dictionary containing table names as keys and list of common columns with base
            table as values. It indicates which tables and columns can be used for joining.

        Example:
            Assuming we have two tables, 'orders' and 'customers', 'orders' table has columns
            ['order_id', 'product', 'customer_id'] and 'customers' table has columns ['customer_id', 'name', 'address'].

            >>> table_to_join2cols = {'customers': ['customer_id']}
            >>> _generate_join_cat_columns('orders', table_to_join2cols)

            After calling the method, the 'sql_generated' attribute of the class instance will contain the following:

            >>> {'sql_tags': ['JOIN-PROJECT-CAT'],
            >>>    'queries': ["SELECT T1.product, T2.name FROM orders AS T1 JOIN customers AS T2 ON T1.order_id=T2.order_id"],
            >>>    'questions': ['List all the "product" and "name" from the table "orders" and the table "customers" where "order_id" is the same']}
        """
        queries, questions, sql_tags = [], [], []
        _, t1_cat_cols, _ = self._sample_cat_num_cols(table_name, 1)
        for t2, join_col in table_to_join2cols.items():
            _, t2_cat_cols, _ = self._sample_cat_num_cols(t2, 1)
            if not t1_cat_cols or not t2_cat_cols:
                # if there is no categorical column in the table, skip
                continue
            for col in join_col:
                # create the join query
                queries.append(f'SELECT T1.`{t1_cat_cols[0]}`, T2.`{t2_cat_cols[0]}` '
                               f'FROM `{table_name}` AS T1 JOIN {t2} AS T2 ON T1.`{col}`=T2.`{col}`')
                questions.append(
                    f'List all the "{t1_cat_cols[0]}" and "{t2_cat_cols[0]}" from the table "{table_name}" and the table "{t2}" '
                    f'where {col} is the same')
                sql_tags.append('JOIN-PROJECT-CAT')
        self.append_sql_generated(sql_tags, queries, questions)

    @staticmethod
    def _get_columns_to_join(tbl_1_cols: list, tbl_2_cols: list) -> list:
        """
        Returns the list of common columns from both tables that contain the keyword 'id' in their names.

        This method can be helpful in a SQL join operation to identify the common columns
        between two tables having 'id' keyword in their names.

        Args:
            tbl_1_cols (list): A list of column names from the first table.
            tbl_2_cols (list): A list of column names from the second table.

        Returns:
            list: A list of common column names between tbl_1_cols and tbl_2_cols,
            which contain the keyword 'id'.

        Example:
            >>> tbl_1_cols = ['user_id', 'username', 'email']
            >>> tbl_2_cols = ['product_id', 'user_id', 'product_name']
            >>> JoinGenerator._get_columns_to_join(tbl_1_cols, tbl_2_cols)
            ['user_id']
        """
        # remove all the columns that do not contain "id" in the name
        tbl_1_cols = {col for col in tbl_1_cols if "id" in col.lower()}
        tbl_2_cols = {col for col in tbl_2_cols if "id" in col.lower()}
        # get the columns that are in both tables
        cols_to_join = tbl_1_cols.intersection(tbl_2_cols)
        return list(cols_to_join)

    def _get_table_name_to_join(self, table_name: str) -> dict:
        """
        This function obtains all the tables that can be joined with the provided table based on the common columns.

        Args:
            table_name (str): The name of the table for which joining tables are to be obtained.

        Returns:
            dict: A dictionary with table names as keys and a list of common column names with the input table as values.

        Example:
            Consider three tables in the database: 'table1', 'table2', 'table3'.
            'table1' has columns 'A', 'B', 'C'. 'table2' has columns 'B', 'D', 'E'.
            'table3' has columns 'F', 'G'.
            Calling this function with 'table1' would return:
            {'table2': ['B']} as 'table2' can be joined with 'table1' on column 'B'.

        Notes:
            The function doesn't consider any inner join for now.
            It interacts with the database object associated with the class instance to obtain table names and schemas.
        """
        # get all the tables in the database
        tables = self.database.table_names
        t1_col = self.database.get_schema_given(table_name)['name']
        table_to_join = dict()
        for tbl in tables:
            if tbl == table_name:
                # skip inner join for now
                continue
            t2_col = self.database.get_schema_given(tbl)['name']
            cols_to_join = self._get_columns_to_join(t1_col, t2_col)
            if cols_to_join:
                table_to_join[tbl] = cols_to_join
        return table_to_join

sql_generate(table_name)

This method generates SQL queries by performing JOIN operations on the specified table with other tables that have common columns in the database. It first empties any previously generated SQL queries, then it determines the tables to join based on common columns, and finally it generates the actual SQL queries.

Parameters:

Name Type Description Default
table_name str

The name of the input table.

required

Returns:

Type Description
dict[str, list]

dict[str, list]: A dictionary where the keys are SQL tags and the values are lists of generated SQL queries.

Examples:

Assuming a MultipleDatabases object "database" with two tables, 'orders' and 'customers', 'orders' table has columns

['order_id', 'product', 'customer_id'] and 'customers' table has columns ['customer_id', 'name', 'address'].
>>> generator = JoinGenerator(database)
>>> generator._generate_join_project_all("orders")
>>> generator.sql_generated
>>> {'sql_tags': ['JOIN-PROJECT-ALL'],
>>>    'queries': ['SELECT * FROM "orders" AS T1 JOIN customers AS T2 ON T1.customer_id=T2.customer_id'],
>>>    'questions': ['Join all the records from table "orders" with table "customers" on "customer_id"']}

>>> generator._generate_join_cat_columns("orders")
>>> generator.sql_generated
>>> {'sql_tags': ['JOIN-PROJECT-CAT'],
>>>    'queries': ["SELECT T1.product, T2.name FROM orders AS T1 JOIN customers AS T2 ON T1.order_id=T2.order_id"],
>>>    'questions': ['List all the "product" and "name" from the table "orders" and the table "customers" where "order_id" is the same']}
Note

The method makes use of other internal methods to firstly get tables that can be joined with the given table and generate SQL queries on this basis.

Source code in qatch/sql_generator/join_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    This method generates SQL queries by performing JOIN operations on the
    specified table with other tables that have common columns in the database.
    It first empties any previously generated SQL queries, then it determines the
    tables to join based on common columns, and finally it generates the
    actual SQL queries.

    Args:
        table_name (str): The name of the input table.

    Returns:
        dict[str, list]: A dictionary where the keys are SQL tags and the values are lists of generated SQL queries.

    Examples:
         Assuming a MultipleDatabases object "database" with two tables, 'orders' and 'customers', 'orders' table has columns
        ['order_id', 'product', 'customer_id'] and 'customers' table has columns ['customer_id', 'name', 'address'].
        >>> generator = JoinGenerator(database)
        >>> generator._generate_join_project_all("orders")
        >>> generator.sql_generated
        >>> {'sql_tags': ['JOIN-PROJECT-ALL'],
        >>>    'queries': ['SELECT * FROM "orders" AS T1 JOIN customers AS T2 ON T1.customer_id=T2.customer_id'],
        >>>    'questions': ['Join all the records from table "orders" with table "customers" on "customer_id"']}

        >>> generator._generate_join_cat_columns("orders")
        >>> generator.sql_generated
        >>> {'sql_tags': ['JOIN-PROJECT-CAT'],
        >>>    'queries': ["SELECT T1.product, T2.name FROM orders AS T1 JOIN customers AS T2 ON T1.order_id=T2.order_id"],
        >>>    'questions': ['List all the "product" and "name" from the table "orders" and the table "customers" where "order_id" is the same']}

    Note:
        The method makes use of other internal methods to firstly
        get tables that can be joined with the given table and
        generate SQL queries on this basis.
    """
    self.empty_sql_generated()
    # get columns to perform the join operator that contains
    table_to_join2cols = self._get_table_name_to_join(table_name)
    self._generate_join_cat_columns(table_name, table_to_join2cols)
    self._generate_join_project_all(table_name, table_to_join2cols)
    return self.sql_generated

NullGenerator

Bases: AbstractSqlGenerator

A class for generating NULL SQL queries and corresponding questions based on a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/null_generator.py
class NullGenerator(AbstractSqlGenerator):
    """
    A class for generating NULL SQL queries and corresponding questions based on a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generates NULL queries and corresponding questions.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" with 10 NULL values:
            >>> generator = NullGenerator(database)
            >>> generator._build_null_count('table_name', ['colors'])
            >>> generator.sql_generated
            {
                "sql_tags": ['NULL-COUNT'],
                "queries": ['SELECT COUNT(*) FROM "table_name" WHERE "colors" IS NULL'],
                "questions": ['Count the rows where the values of "colors" are missing in table "table_name"']
            }
            >>> generator._build_not_null_count('table_name', ['colors'])
            >>> generator.sql_generated
            {
                "sql_tags": ['NOT-NULL-COUNT'],
                "queries": ['SELECT COUNT(*) FROM "table_name" WHERE "colors" IS NOT NULL'],
                "questions": ['Count the rows where the values of "colors" are not missing in table "table_name"']
            }
        """
        self.empty_sql_generated()
        null_cols = self._get_null_cols(table_name)
        self._build_null_count(table_name, null_cols)
        self._build_not_null_count(table_name, null_cols)
        # self._build_null_with_no_null_col(table_name)
        return self.sql_generated

    def _build_null_count(self, table_name: str, null_cols: list[str]):
        """
        Build SQL queries and questions for counting rows with NULL values in specified columns.

        Args:
            table_name (str): The name of the table in the database.
            null_cols (list): List of column names with NULL values.
        """
        queries = [f'SELECT COUNT(*) FROM `{table_name}` WHERE `{col}` IS NULL'
                   for col in null_cols]

        questions = [f'Count the rows where the values of "{col}" are missing in table "{table_name}"'
                     for col in null_cols]

        sql_tags = ['NULL-COUNT'] * len(queries)
        self.append_sql_generated(sql_tags, queries, questions)

    def _build_not_null_count(self, table_name, null_cols: list[str]):
        """
        Build SQL queries and questions for counting rows with non-NULL values in specified columns.

        Args:
            table_name (str): The name of the table in the database.
            null_cols (list): List of column names with NULL values.
        """
        queries = [f'SELECT COUNT(*) FROM `{table_name}` WHERE `{col}` IS NOT NULL'
                   for col in null_cols]

        questions = [f'Count the rows where the values of "{col}" are not missing in table "{table_name}"'
                     for col in null_cols]

        sql_tags = ['NOT-NULL-COUNT'] * len(queries)
        self.append_sql_generated(sql_tags, queries, questions)

    def _get_null_cols(self, table_name: str, sample=2):
        def _get_sample(_columns, k):
            random.seed(self.seed)
            return random.sample(_columns, k)

        """
        Randomly select columns with NULL values from the given table for generating queries.

        Args:
            table_name (str): The name of the table in the database.
            sample (int, optional): Number of columns to sample. Default is 2.

        Returns:
            list: List of column names with NULL values.
        """
        df, _, _ = self._sample_cat_num_cols(table_name)
        mask = df.isnull().any()
        cols = list(df.columns[mask])
        return _get_sample(cols, sample) if len(cols) >= sample else cols

sql_generate(table_name)

Generates NULL queries and corresponding questions.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" with 10 NULL values:

>>> generator = NullGenerator(database)
>>> generator._build_null_count('table_name', ['colors'])
>>> generator.sql_generated
{
    "sql_tags": ['NULL-COUNT'],
    "queries": ['SELECT COUNT(*) FROM "table_name" WHERE "colors" IS NULL'],
    "questions": ['Count the rows where the values of "colors" are missing in table "table_name"']
}
>>> generator._build_not_null_count('table_name', ['colors'])
>>> generator.sql_generated
{
    "sql_tags": ['NOT-NULL-COUNT'],
    "queries": ['SELECT COUNT(*) FROM "table_name" WHERE "colors" IS NOT NULL'],
    "questions": ['Count the rows where the values of "colors" are not missing in table "table_name"']
}
Source code in qatch/sql_generator/null_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generates NULL queries and corresponding questions.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" with 10 NULL values:
        >>> generator = NullGenerator(database)
        >>> generator._build_null_count('table_name', ['colors'])
        >>> generator.sql_generated
        {
            "sql_tags": ['NULL-COUNT'],
            "queries": ['SELECT COUNT(*) FROM "table_name" WHERE "colors" IS NULL'],
            "questions": ['Count the rows where the values of "colors" are missing in table "table_name"']
        }
        >>> generator._build_not_null_count('table_name', ['colors'])
        >>> generator.sql_generated
        {
            "sql_tags": ['NOT-NULL-COUNT'],
            "queries": ['SELECT COUNT(*) FROM "table_name" WHERE "colors" IS NOT NULL'],
            "questions": ['Count the rows where the values of "colors" are not missing in table "table_name"']
        }
    """
    self.empty_sql_generated()
    null_cols = self._get_null_cols(table_name)
    self._build_null_count(table_name, null_cols)
    self._build_not_null_count(table_name, null_cols)
    # self._build_null_with_no_null_col(table_name)
    return self.sql_generated

OrderByGenerator

Bases: AbstractSqlGenerator

A class for generating ORDER BY SQL queries and corresponding questions based on a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/orderby_generator.py
class OrderByGenerator(AbstractSqlGenerator):
    """
    A class for generating ORDER BY SQL queries and corresponding questions based on a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generate ORDER BY queries and corresponding questions based on the specified table.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
            >>> generator = OrderByGenerator(database)
            >>> generator._generate_order_asc("table_name", ["colors", "numbers"])
            >>> generator.sql_generated
            {
                "sql_tags": ["ORDERBY-SINGLE", "ORDERBY-SINGLE"],
                "queries": [
                    'SELECT * FROM "table_name" ORDER BY "colors" ASC',
                    'SELECT * FROM "table_name" ORDER BY "numbers" ASC'
                ],
                "questions": [
                    'Show all data ordered by "colors" in ascending order for the table "table_name"',
                    'Show all data ordered by "numbers" in ascending order for the table "table_name"'
                ]
            }
            >>> generator._generate_order_desc("table_name", ["colors", "numbers"])
            >>> generator.sql_generated
            {
                "sql_tags": ["ORDERBY-SINGLE", "ORDERBY-SINGLE"],
                "queries": [
                    'SELECT * FROM "table_name" ORDER BY "colors" DESC',
                    'SELECT * FROM "table_name" ORDER BY "numbers" DESC'
                ],
                "questions": [
                    'Show all data ordered by "colors" in descending order for the table "table_name"',
                    'Show all data ordered by "numbers" in descending order for the table "table_name"'
                ]
            }
            >>> generator._generate_order_asc_project("table_name", ["colors", "numbers"])
            >>> generator.sql_generated
            {
                "sql_tags": ["ORDERBY-PROJECT", "ORDERBY-PROJECT"],
                "queries": [
                    'SELECT "colors" FROM "table_name" ORDER BY "colors" ASC',
                    'SELECT "numbers" FROM "table_name" ORDER BY "numbers" ASC'
                ],
                "questions": [
                    'Project the "colors" ordered in ascending order for the table "table_name"',
                    'Project the "numbers" ordered in ascending order for the table "table_name"'
                ]
            }
            >>> generator._generate_order_desc_project("table_name", ["colors", "numbers"])
            >>> generator.sql_generated
            {
                "sql_tags": ["ORDERBY-PROJECT", "ORDERBY-PROJECT"],
                "queries": [
                    'SELECT "colors" FROM "table_name" ORDER BY "colors" DESC',
                    'SELECT "numbers" FROM "table_name" ORDER BY "numbers" DESC'
                ],
                "questions": [
                    'Project the "colors" ordered in descending order for the table "table_name"',
                    'Project the "numbers" ordered in descending order for the table "table_name"'
                ]
            }
        """
        self.empty_sql_generated()
        # to avoid too many ORDERBY sql queries, sample only 2 Cate and 2 numerical columns
        _, cat_cols, num_cols = self._sample_cat_num_cols(table_name, sample=2)
        columns = cat_cols + num_cols
        self._generate_order_asc(table_name, columns)
        self._generate_order_desc(table_name, columns)

        self._generate_order_asc_project(table_name, columns)

        self._generate_order_desc_project(table_name, columns)
        return self.sql_generated

    def _generate_order_asc(self, table_name: str, columns: list[str]):
        """
        Generates SQL queries and questions for ordering data in ascending order for each column.

        Args:
            table_name (str): The name of the table in the database.
            columns (list): List of column names.
        """
        queries = [f'SELECT * FROM `{table_name}` ORDER BY `{col}` ASC'
                   for col in columns]

        questions = [
            f'Show all data ordered by "{col}" in ascending order for the table "{table_name}"'
            for col in columns
        ]
        sql_tags = ['ORDERBY-SINGLE'] * len(queries)
        self.append_sql_generated(sql_tags, queries, questions)

    def _generate_order_desc(self, table_name: str, columns: list[str]):
        """
        Generates SQL queries and questions for ordering data in descending order for each column.

        Args:
            table_name (str): The name of the table in the database.
            columns (list): List of column names.
        """
        queries = [f'SELECT * FROM `{table_name}` ORDER BY `{col}` DESC'
                   for col in columns]

        questions = [
            f'Show all data ordered by {col} in descending order for the table {table_name}'
            for col in columns
        ]
        sql_tags = ['ORDERBY-SINGLE'] * len(queries)
        self.append_sql_generated(sql_tags, queries, questions)

    def _generate_order_asc_project(self, table_name: str, columns: list[str]):
        """
        Generates SQL queries and questions for projecting a single column and ordering it in ascending order.

        Args:
            table_name (str): The name of the table in the database.
            columns (list): List of column names.
        """
        queries = [f'SELECT `{col}` FROM `{table_name}` ORDER BY `{col}` ASC'
                   for col in columns]

        questions = [
            f'Project the "{col}" ordered in ascending order for the table {table_name}'
            for col in columns
        ]
        sql_tags = ['ORDERBY-PROJECT'] * len(queries)
        self.append_sql_generated(sql_tags, queries, questions)

    def _generate_order_desc_project(self, table_name: str, columns: list[str]):
        """
        Generates SQL queries and questions for projecting a single column and ordering it in descending order.

        Args:
            table_name (str): The name of the table in the database.
            columns (list): List of column names.
        """
        queries = [f'SELECT `{col}` FROM `{table_name}` ORDER BY `{col}` DESC'
                   for col in columns]

        questions = [
            f'Project the "{col}" ordered in descending order for the table {table_name}'
            for col in columns
        ]
        sql_tags = ['ORDERBY-PROJECT'] * len(queries)
        self.append_sql_generated(sql_tags, queries, questions)

sql_generate(table_name)

Generate ORDER BY queries and corresponding questions based on the specified table.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"

>>> generator = OrderByGenerator(database)
>>> generator._generate_order_asc("table_name", ["colors", "numbers"])
>>> generator.sql_generated
{
    "sql_tags": ["ORDERBY-SINGLE", "ORDERBY-SINGLE"],
    "queries": [
        'SELECT * FROM "table_name" ORDER BY "colors" ASC',
        'SELECT * FROM "table_name" ORDER BY "numbers" ASC'
    ],
    "questions": [
        'Show all data ordered by "colors" in ascending order for the table "table_name"',
        'Show all data ordered by "numbers" in ascending order for the table "table_name"'
    ]
}
>>> generator._generate_order_desc("table_name", ["colors", "numbers"])
>>> generator.sql_generated
{
    "sql_tags": ["ORDERBY-SINGLE", "ORDERBY-SINGLE"],
    "queries": [
        'SELECT * FROM "table_name" ORDER BY "colors" DESC',
        'SELECT * FROM "table_name" ORDER BY "numbers" DESC'
    ],
    "questions": [
        'Show all data ordered by "colors" in descending order for the table "table_name"',
        'Show all data ordered by "numbers" in descending order for the table "table_name"'
    ]
}
>>> generator._generate_order_asc_project("table_name", ["colors", "numbers"])
>>> generator.sql_generated
{
    "sql_tags": ["ORDERBY-PROJECT", "ORDERBY-PROJECT"],
    "queries": [
        'SELECT "colors" FROM "table_name" ORDER BY "colors" ASC',
        'SELECT "numbers" FROM "table_name" ORDER BY "numbers" ASC'
    ],
    "questions": [
        'Project the "colors" ordered in ascending order for the table "table_name"',
        'Project the "numbers" ordered in ascending order for the table "table_name"'
    ]
}
>>> generator._generate_order_desc_project("table_name", ["colors", "numbers"])
>>> generator.sql_generated
{
    "sql_tags": ["ORDERBY-PROJECT", "ORDERBY-PROJECT"],
    "queries": [
        'SELECT "colors" FROM "table_name" ORDER BY "colors" DESC',
        'SELECT "numbers" FROM "table_name" ORDER BY "numbers" DESC'
    ],
    "questions": [
        'Project the "colors" ordered in descending order for the table "table_name"',
        'Project the "numbers" ordered in descending order for the table "table_name"'
    ]
}
Source code in qatch/sql_generator/orderby_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generate ORDER BY queries and corresponding questions based on the specified table.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
        >>> generator = OrderByGenerator(database)
        >>> generator._generate_order_asc("table_name", ["colors", "numbers"])
        >>> generator.sql_generated
        {
            "sql_tags": ["ORDERBY-SINGLE", "ORDERBY-SINGLE"],
            "queries": [
                'SELECT * FROM "table_name" ORDER BY "colors" ASC',
                'SELECT * FROM "table_name" ORDER BY "numbers" ASC'
            ],
            "questions": [
                'Show all data ordered by "colors" in ascending order for the table "table_name"',
                'Show all data ordered by "numbers" in ascending order for the table "table_name"'
            ]
        }
        >>> generator._generate_order_desc("table_name", ["colors", "numbers"])
        >>> generator.sql_generated
        {
            "sql_tags": ["ORDERBY-SINGLE", "ORDERBY-SINGLE"],
            "queries": [
                'SELECT * FROM "table_name" ORDER BY "colors" DESC',
                'SELECT * FROM "table_name" ORDER BY "numbers" DESC'
            ],
            "questions": [
                'Show all data ordered by "colors" in descending order for the table "table_name"',
                'Show all data ordered by "numbers" in descending order for the table "table_name"'
            ]
        }
        >>> generator._generate_order_asc_project("table_name", ["colors", "numbers"])
        >>> generator.sql_generated
        {
            "sql_tags": ["ORDERBY-PROJECT", "ORDERBY-PROJECT"],
            "queries": [
                'SELECT "colors" FROM "table_name" ORDER BY "colors" ASC',
                'SELECT "numbers" FROM "table_name" ORDER BY "numbers" ASC'
            ],
            "questions": [
                'Project the "colors" ordered in ascending order for the table "table_name"',
                'Project the "numbers" ordered in ascending order for the table "table_name"'
            ]
        }
        >>> generator._generate_order_desc_project("table_name", ["colors", "numbers"])
        >>> generator.sql_generated
        {
            "sql_tags": ["ORDERBY-PROJECT", "ORDERBY-PROJECT"],
            "queries": [
                'SELECT "colors" FROM "table_name" ORDER BY "colors" DESC',
                'SELECT "numbers" FROM "table_name" ORDER BY "numbers" DESC'
            ],
            "questions": [
                'Project the "colors" ordered in descending order for the table "table_name"',
                'Project the "numbers" ordered in descending order for the table "table_name"'
            ]
        }
    """
    self.empty_sql_generated()
    # to avoid too many ORDERBY sql queries, sample only 2 Cate and 2 numerical columns
    _, cat_cols, num_cols = self._sample_cat_num_cols(table_name, sample=2)
    columns = cat_cols + num_cols
    self._generate_order_asc(table_name, columns)
    self._generate_order_desc(table_name, columns)

    self._generate_order_asc_project(table_name, columns)

    self._generate_order_desc_project(table_name, columns)
    return self.sql_generated

SelectGenerator

Bases: AbstractSqlGenerator

A class for generating SELECT SQL queries and corresponding questions based on a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Source code in qatch/sql_generator/select_generator.py
class SelectGenerator(AbstractSqlGenerator):
    """
    A class for generating SELECT SQL queries and corresponding questions based on a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generate SQL tags, queries, and questions based on the specified table.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "name", "colors" and "numbers":
            >>> generator = SelectGenerator(database)
            >>> generator._select_all_table("table_name")
            >>> generator.sql_generated
            {
                "sql_tags": ["SELECT-ALL"],
                "queries": ['SELECT * FROM "table_name"'],
                "questions": ["Show all the rows in the table table_name"]
            }
            >>> generator._select_add_col("table_name")
            >>> generator.sql_generated
            {
                "sql_tags": ["SELECT-ADD-COL", "SELECT-ADD-COL", "SELECT-ADD-COL"],
                "queries": ['SELECT "colors" FROM "table_name"',
                            'SELECT "colors", "numbers" FROM "table_name"'
                            'SELECT "colors", "numbers", "name" FROM "table_name"'],
                "questions": ["Show all colors in the table table_name",
                              "Show all colors, numbers in the table table_name"
                              "Show all colors, numbers, name in the table table_name"]
            }
            >>> generator._select_random_col("table_name")
            >>> generator.sql_generated
            {
                "sql_tags": ["SELECT-RANDOM-COL", "SELECT-RANDOM-COL"],
                "queries": ['SELECT "colors" FROM "table_name"',
                            'SELECT "name", "numbers" FROM "table_name"',
                            'SELECT "numbers", "colors", "name" FROM "table_name"'],
                "questions": ["Show all colors in the table table_name",
                              "Show all name, numbers in the table table_name",
                              "Show all numbers, colors, name in the table table_name"]
            }

        """
        self.empty_sql_generated()
        self._select_all_table(table_name)
        self._select_add_col(table_name)
        self._select_random_col(table_name)
        return self.sql_generated

    def _select_single_col(self, table_name):
        """Generates SELECT SQL queries and corresponding questions for a single column of a given table.

        This method selects each column of a given table one by one and generates SQL queries
        and corresponding questions. These queries and questions are stored in the 'sql_generated' dictionary attribute of the class.

        Args:
            table_name (str): Name of the table.
        Example:
            Assume the 'users' table has two columns 'id' and 'name'.

                >>> self.database = SingleDatabase('sqlite:///my_db.sqlite3')
                >>> sql_gen = SelectGenerator(self.database)
                >>> sql_gen._select_single_col('users')
                ...
                >>> print(sql_gen.sql_generated)
                {
                    "sql_tags": ["SELECT-SINGLE-COL", "SELECT-SINGLE-COL"],
                    "queries": ["SELECT "id" FROM users;", "SELECT "name" FROM users;"],
                    "questions": ["What are the "id" for all users?", "What are the "name" of all users?"]
                }
        """
        columns = self.database.get_schema_given(table_name).name.tolist()
        columns = [[col] for col in columns]
        # sort columns
        questions = self._build_questions(columns, table_name)
        queries = self._build_queries(columns, table_name)
        self.append_sql_generated(sql_tags=['SELECT-SINGLE-COL'] * len(queries),
                                  queries=queries,
                                  questions=questions)
        return self.sql_generated

    def _select_all_table(self, table_name: str):
        """
        Generate the SQL query and question for selecting all rows in the table.

        Args:
            table_name (str): The name of the table in the database.
        """
        sql_tag = ['SELECT-ALL']
        query = [f'SELECT * FROM `{table_name}`']
        question = [f"Show all the rows in the table {table_name}"]
        self.append_sql_generated(sql_tags=sql_tag, queries=query, questions=question)

    def _select_add_col(self, table_name: str):
        """
        Generate the SQL query and question for selecting increasingly more columns in the table.

        Args:
            table_name (str): The name of the table in the database.
        """
        columns = self.database.get_schema_given(table_name).name.tolist()
        comb_cols_add = self._comb_add_columns(columns)

        questions = self._build_questions(comb_cols_add, table_name)
        queries = self._build_queries(comb_cols_add, table_name)

        self.append_sql_generated(sql_tags=['SELECT-ADD-COL'] * len(comb_cols_add),
                                  queries=queries, questions=questions)

    def _select_random_col(self, table_name: str):
        """
        Generate the SQL query and question for selecting random columns in the table.

        Args:
            table_name (str): The name of the table in the database.
        """
        columns = self.database.get_schema_given(table_name).name.tolist()
        comb_cols_rand = self._comb_random(columns)

        questions = self._build_questions(comb_cols_rand, table_name)
        queries = self._build_queries(comb_cols_rand, table_name)

        self.append_sql_generated(sql_tags=['SELECT-RANDOM-COL'] * len(comb_cols_rand),
                                  queries=queries, questions=questions)

    def _build_questions(self, combinations: list[list[str]], table_name) -> list[str]:
        """
        Builds questions corresponding to the given column combinations and table name.

        Args:
            combinations (list[list[str]]): List of column combinations.
            table_name (str): The name of the table in the database.

        Returns:
            list[str]: A list of questions corresponding to the column combinations.
        """
        return [f'Show all {self._get_col_comb_str(comb)} in the table {table_name}'
                for comb in combinations]

    def _build_queries(self, combinations: list[list[str]], table_name: str) -> list[str]:
        """
        Builds SQL queries corresponding to the given column combinations and table name.

        Args:
            combinations (list[list[str]]): List of column combinations.
            table_name (str): The name of the table in the database.

        Returns:
            list[str]: A list of SQL queries corresponding to the column combinations.
        """
        return [f'SELECT {self._get_col_comb_str(comb)} FROM `{table_name}`'
                for comb in combinations]

    @staticmethod
    def _comb_add_columns(columns: list[str]) -> list[list[str]]:
        """
        Generates column combinations by incrementally adding columns to the query.

        Args:
            columns (list[str]): List of column names.

        Returns:
            list[list[str]]: A list of column combinations.
        """
        return [columns[:i] for i in range(1, len(columns))]

sql_generate(table_name)

Generate SQL tags, queries, and questions based on the specified table.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "name", "colors" and "numbers":

>>> generator = SelectGenerator(database)
>>> generator._select_all_table("table_name")
>>> generator.sql_generated
{
    "sql_tags": ["SELECT-ALL"],
    "queries": ['SELECT * FROM "table_name"'],
    "questions": ["Show all the rows in the table table_name"]
}
>>> generator._select_add_col("table_name")
>>> generator.sql_generated
{
    "sql_tags": ["SELECT-ADD-COL", "SELECT-ADD-COL", "SELECT-ADD-COL"],
    "queries": ['SELECT "colors" FROM "table_name"',
                'SELECT "colors", "numbers" FROM "table_name"'
                'SELECT "colors", "numbers", "name" FROM "table_name"'],
    "questions": ["Show all colors in the table table_name",
                  "Show all colors, numbers in the table table_name"
                  "Show all colors, numbers, name in the table table_name"]
}
>>> generator._select_random_col("table_name")
>>> generator.sql_generated
{
    "sql_tags": ["SELECT-RANDOM-COL", "SELECT-RANDOM-COL"],
    "queries": ['SELECT "colors" FROM "table_name"',
                'SELECT "name", "numbers" FROM "table_name"',
                'SELECT "numbers", "colors", "name" FROM "table_name"'],
    "questions": ["Show all colors in the table table_name",
                  "Show all name, numbers in the table table_name",
                  "Show all numbers, colors, name in the table table_name"]
}
Source code in qatch/sql_generator/select_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generate SQL tags, queries, and questions based on the specified table.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "name", "colors" and "numbers":
        >>> generator = SelectGenerator(database)
        >>> generator._select_all_table("table_name")
        >>> generator.sql_generated
        {
            "sql_tags": ["SELECT-ALL"],
            "queries": ['SELECT * FROM "table_name"'],
            "questions": ["Show all the rows in the table table_name"]
        }
        >>> generator._select_add_col("table_name")
        >>> generator.sql_generated
        {
            "sql_tags": ["SELECT-ADD-COL", "SELECT-ADD-COL", "SELECT-ADD-COL"],
            "queries": ['SELECT "colors" FROM "table_name"',
                        'SELECT "colors", "numbers" FROM "table_name"'
                        'SELECT "colors", "numbers", "name" FROM "table_name"'],
            "questions": ["Show all colors in the table table_name",
                          "Show all colors, numbers in the table table_name"
                          "Show all colors, numbers, name in the table table_name"]
        }
        >>> generator._select_random_col("table_name")
        >>> generator.sql_generated
        {
            "sql_tags": ["SELECT-RANDOM-COL", "SELECT-RANDOM-COL"],
            "queries": ['SELECT "colors" FROM "table_name"',
                        'SELECT "name", "numbers" FROM "table_name"',
                        'SELECT "numbers", "colors", "name" FROM "table_name"'],
            "questions": ["Show all colors in the table table_name",
                          "Show all name, numbers in the table table_name",
                          "Show all numbers, colors, name in the table table_name"]
        }

    """
    self.empty_sql_generated()
    self._select_all_table(table_name)
    self._select_add_col(table_name)
    self._select_random_col(table_name)
    return self.sql_generated

SimpleAggGenerator

Bases: AbstractSqlGenerator

A class for generating Simple Aggregation queries and corresponding questions based on a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

Source code in qatch/sql_generator/simple_agg_generator.py
class SimpleAggGenerator(AbstractSqlGenerator):
    """
    A class for generating Simple Aggregation queries and corresponding questions based on a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, and questions.
            Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generates Simple Aggregation SQL queries and corresponding questions for the specified table.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, and questions.
                  Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
            >>> generator = SimpleAggGenerator(database)
            >>> generator._build_count_cat("table_name", ["colors"])
            >>> generator.sql_generated
            {
                "sql_tags": ["SIMPLE-AGG-COUNT", "SIMPLE-AGG-COUNT-DISTINCT"],
                "queries": [
                    'SELECT COUNT(*) FROM "table_name"',
                    'SELECT COUNT(DISTINCT"colors") FROM "table_name"'
                ],
                "questions": [
                    'Count the records in table "table_name"?',
                    'How many different "colors" are in table "table_name"?'
                ]
            }
            >>> generator._build_count_agg("table_name", ["numbers"])
            >>> generator.sql_generated
            {
                "sql_tags": ["SIMPLE-AGG-MAX", "SIMPLE-AGG-MIN", "SIMPLE-AGG-AVG"],
                "queries": [
                    'SELECT MAX("numbers") FROM "table_name"',
                    'SELECT MIN("numbers") FROM "table_name"',
                    'SELECT AVG("numbers") FROM "table_name"'
                ],
                "questions": [
                    'Find the maximum "numbers" for the table "table_name"',
                    'Find the minimum "numbers" for the table "table_name"',
                    'Find the average "numbers" for the table "table_name"'
                ]
            }
        """
        self.empty_sql_generated()
        _, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
        self._build_count_cat(table_name, cat_cols)
        self._build_count_agg(table_name, num_cols)
        return self.sql_generated

    def _build_count_cat(self, table_name, cat_cols):
        """
        Generates COUNT SQL queries and questions for categorical columns.

        Args:
            table_name (str): The name of the table in the database.
            cat_cols (List[str]): List of categorical columns in the table.
        """

        queries = [f'SELECT COUNT(*) FROM `{table_name}`']
        questions = [f'Count the records in table "{table_name}"?']
        sql_tags = ['SIMPLE-AGG-COUNT']

        for cat_col in cat_cols:
            queries += [f'SELECT COUNT(DISTINCT `{cat_col}`) FROM `{table_name}`']
            questions += [f'How many different "{cat_col}" are in table "{table_name}"?']
            sql_tags += ['SIMPLE-AGG-COUNT-DISTINCT']

        self.append_sql_generated(sql_tags, queries, questions)

    def _build_count_agg(self, table_name, num_cols):
        """
        Generates MAX, MIN, and AVG SQL queries and questions for numerical columns.

        Args:
            table_name (str): The name of the table in the database.
            num_cols (List[str]): List of numerical columns in the table.
        """
        for num_col in num_cols:
            queries = [
                f'SELECT MAX(`{num_col}`) FROM `{table_name}`',
                f'SELECT MIN(`{num_col}`) FROM `{table_name}`',
                f'SELECT AVG(`{num_col}`) FROM `{table_name}`'
            ]
            questions = [
                f'Find the maximum "{num_col}" for the table "{table_name}"',
                f'Find the minimum "{num_col}" for the table "{table_name}"',
                f'Find the average "{num_col}" for the table "{table_name}"'
            ]
            sql_tags = ['SIMPLE-AGG-MAX', 'SIMPLE-AGG-MIN', 'SIMPLE-AGG-AVG']
            self.append_sql_generated(sql_tags, queries, questions)

sql_generate(table_name)

Generates Simple Aggregation SQL queries and corresponding questions for the specified table.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, and questions. Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"

>>> generator = SimpleAggGenerator(database)
>>> generator._build_count_cat("table_name", ["colors"])
>>> generator.sql_generated
{
    "sql_tags": ["SIMPLE-AGG-COUNT", "SIMPLE-AGG-COUNT-DISTINCT"],
    "queries": [
        'SELECT COUNT(*) FROM "table_name"',
        'SELECT COUNT(DISTINCT"colors") FROM "table_name"'
    ],
    "questions": [
        'Count the records in table "table_name"?',
        'How many different "colors" are in table "table_name"?'
    ]
}
>>> generator._build_count_agg("table_name", ["numbers"])
>>> generator.sql_generated
{
    "sql_tags": ["SIMPLE-AGG-MAX", "SIMPLE-AGG-MIN", "SIMPLE-AGG-AVG"],
    "queries": [
        'SELECT MAX("numbers") FROM "table_name"',
        'SELECT MIN("numbers") FROM "table_name"',
        'SELECT AVG("numbers") FROM "table_name"'
    ],
    "questions": [
        'Find the maximum "numbers" for the table "table_name"',
        'Find the minimum "numbers" for the table "table_name"',
        'Find the average "numbers" for the table "table_name"'
    ]
}
Source code in qatch/sql_generator/simple_agg_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generates Simple Aggregation SQL queries and corresponding questions for the specified table.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, and questions.
              Format: {"sql_tags": List[str], "queries": List[str], "questions": List[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
        >>> generator = SimpleAggGenerator(database)
        >>> generator._build_count_cat("table_name", ["colors"])
        >>> generator.sql_generated
        {
            "sql_tags": ["SIMPLE-AGG-COUNT", "SIMPLE-AGG-COUNT-DISTINCT"],
            "queries": [
                'SELECT COUNT(*) FROM "table_name"',
                'SELECT COUNT(DISTINCT"colors") FROM "table_name"'
            ],
            "questions": [
                'Count the records in table "table_name"?',
                'How many different "colors" are in table "table_name"?'
            ]
        }
        >>> generator._build_count_agg("table_name", ["numbers"])
        >>> generator.sql_generated
        {
            "sql_tags": ["SIMPLE-AGG-MAX", "SIMPLE-AGG-MIN", "SIMPLE-AGG-AVG"],
            "queries": [
                'SELECT MAX("numbers") FROM "table_name"',
                'SELECT MIN("numbers") FROM "table_name"',
                'SELECT AVG("numbers") FROM "table_name"'
            ],
            "questions": [
                'Find the maximum "numbers" for the table "table_name"',
                'Find the minimum "numbers" for the table "table_name"',
                'Find the average "numbers" for the table "table_name"'
            ]
        }
    """
    self.empty_sql_generated()
    _, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
    self._build_count_cat(table_name, cat_cols)
    self._build_count_agg(table_name, num_cols)
    return self.sql_generated

WhereGenerator

Bases: AbstractSqlGenerator

A class for generating WHERE SQL queries and corresponding questions based on a database table.

Attributes:

Name Type Description
database SingleDatabase

The SingleDatabase object representing the database to generate queries from.

sql_generated dict

A dictionary containing generated SQL tags, queries, questions, and results. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str], "results": list[pd.DataFrame]}

Source code in qatch/sql_generator/where_generator.py
class WhereGenerator(AbstractSqlGenerator):
    """
    A class for generating WHERE SQL queries and corresponding questions based on a database table.

    Attributes:
        database (SingleDatabase): The SingleDatabase object representing the database to generate queries from.
        sql_generated (dict): A dictionary containing generated SQL tags, queries, questions, and results.
            Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str], "results": list[pd.DataFrame]}
    """

    def sql_generate(self, table_name: str) -> dict[str, list]:
        """
        Generates WHERE SQL queries and corresponding questions for both categorical and numerical columns.

        Args:
            table_name (str): The name of the table in the database.

        Returns:
            dict: A dictionary containing generated SQL tags, queries, questions, and results.
                  Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

        Examples:
            Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
            >>> sample_df = pd.DataFrame({"colors": ["green", "blue", "blue", "blue", "blue"], "numbers": [1, 2, 3, 4, 5]})
            >>> generator = WhereGenerator(database)
            >>> generator._generate_where_categorical("table_name", ["colors"], sample_df)
            >>> generator.sql_generated
            {
                "sql_tags": ["WHERE-CAT-MOST-FREQUENT", "WHERE-CAT-LEAST-FREQUENT",
                            'WHERE-CAT-MOST-FREQUENT', 'WHERE-CAT-LEAST-FREQUENT',
                            "WHERE-NOT-MOST-FREQUENT", "WHERE-NOT-LEAST-FREQUENT"],
                "queries": [
                    'SELECT * FROM "table_name" WHERE "colors" == "blue"',
                    'SELECT * FROM "table_name" WHERE "colors" == "green"',
                    'SELECT * FROM "table_name" WHERE "colors" != "blue"',
                    'SELECT * FROM "table_name" WHERE "colors" != "green"',
                    'SELECT * FROM "table_name" WHERE NOT "colors" == "blue"',
                    'SELECT * FROM "table_name" WHERE NOT "colors" == "green"',
                ],
                "questions": [
                    'Show the data of the table "table_name" where "colors" is equal to blue',
                    'Show the data of the table "table_name" where "colors" is equal to green',
                    'Show the data of the table "table_name" where "colors" is different from blue',
                    'Show the data of the table "table_name" where "colors" is different from green',
                    'Show the data of the table "table_name" where "colors" is not equal to blue',
                    'Show the data of the table "table_name" where "colors" is not equal to green',
                ]
            }
            >>> generator._generate_where_numerical("table_name", ["numbers"], sample_df)
            >>> generator.sql_generated
            {
                "sql_tags": ['WHERE-NUM-MAX-VALUES', 'WHERE-NUM-MIN-VALUES',
                            'WHERE-NUM-MEAN-VALUES', 'WHERE-NUM-MEAN-VALUES'],
                "queries": ['SELECT * FROM "table_name" WHERE "numbers" < 5',
                            'SELECT * FROM "table_name" WHERE "numbers" > 1,
                            'SELECT * FROM "table_name" WHERE "numbers" > 3.0'
                            'SELECT * FROM "table_name" WHERE "numbers" < 3.0'],
                "question": ['Show the data of the table "table_name" where "numbers" is less than 5',
                            'Show the data of the table "table_name" where "numbers" is greater than 1',
                            'Show the data of the table "table_name" where "numbers" is greater than 3.0',
                            'Show the data of the table "table_name" where "numbers" is less than 3.0'],
            }
        """
        self.empty_sql_generated()
        df, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
        self._generate_where_categorical(table_name, cat_cols, df)
        self._generate_where_numerical(table_name, num_cols, df)
        return self.sql_generated

    def _generate_where_categorical(self, table_name: str, cat_cols: list, df: pd.DataFrame):
        """
        Generates WHERE SQL queries and questions for categorical columns.
        Generates test for both most frequent and least frequent values.

        Args:
            table_name (str): The name of the table in the database.
            cat_cols (list): List of categorical columns.
            df (pd.DataFrame): The DataFrame containing the data.
        """
        if len(cat_cols) == 0:
            # no categorical attributes present
            return
        most_frequent_elements = [self._get_most_frequent_or_max_value(df[col].values) for col in cat_cols]
        least_frequent_elements = [self._get_least_frequent_or_min_value(df[col].values) for col in cat_cols]
        for col, most_freq, least_freq in zip(cat_cols, most_frequent_elements, least_frequent_elements):
            queries = [
                f"""SELECT * FROM `{table_name}` WHERE `{col}` == `{most_freq}` """,
                f"""SELECT * FROM `{table_name}` WHERE `{col}` == `{least_freq}` """,
                f"""SELECT * FROM `{table_name}` WHERE `{col}` != `{most_freq}` """,
                f"""SELECT * FROM `{table_name}` WHERE `{col}` != `{least_freq}` """,
                f"""SELECT * FROM `{table_name}` WHERE NOT `{col}` == `{most_freq}` """,
                f"""SELECT * FROM `{table_name}` WHERE NOT `{col}` == `{least_freq}` """,
            ]

            questions = [
                f'Show the data of the table "{table_name}" where "{col}" is equal to {most_freq}',
                f'Show the data of the table "{table_name}" where "{col}" is equal to {least_freq}',
                f'Show the data of the table "{table_name}" where "{col}" is different from {most_freq}',
                f'Show the data of the table "{table_name}" where "{col}" is different from {least_freq}',
                f'Show the data of the table "{table_name}" where "{col}" is not equal to {most_freq}',
                f'Show the data of the table "{table_name}" where "{col}" is not equal to {least_freq}',
            ]

            sql_tags = ['WHERE-CAT-MOST-FREQUENT', 'WHERE-CAT-LEAST-FREQUENT',
                        'WHERE-CAT-MOST-FREQUENT', 'WHERE-CAT-LEAST-FREQUENT',
                        'WHERE-NOT-MOST-FREQUENT', 'WHERE-NOT-LEAST-FREQUENT']
            self.append_sql_generated(sql_tags, queries, questions)

    def _generate_where_numerical(self, table_name: str, num_cols: list, df: pd.DataFrame):
        """
        Generates WHERE SQL queries and questions for numerical columns.
        Generates test for max, min, and mean values.

        Args:
            table_name (str): The name of the table in the database.
            num_cols (list): List of numerical columns.
            df (pd.DataFrame): The DataFrame containing the data.
        """

        def _generate_given_value(number, n_col):
            queries_n = [
                f'SELECT * FROM `{table_name}` WHERE `{n_col}` > {number}',
                f'SELECT * FROM `{table_name}` WHERE `{n_col}` < {number}',
            ]
            questions_n = [
                f'Show the data of the table "{table_name}" where "{n_col}" is greater than {number}',
                f'Show the data of the table "{table_name}" where "{n_col}" is less than {number}',
            ]
            return queries_n, questions_n

        if len(num_cols) == 0:
            return
        max_elements = [self._get_most_frequent_or_max_value(df[col].values)
                        for col in num_cols]
        min_elements = [self._get_least_frequent_or_min_value(df[col].values)
                        for col in num_cols]
        mean_values = [self._get_median_value(df[col].values) for col in num_cols]
        for col, max_value, min_value, mean_value in zip(num_cols, max_elements,
                                                         min_elements, mean_values):
            queries, questions = _generate_given_value(max_value, col)
            sql_tags = ['WHERE-NUM-MAX-VALUES-EMPTY', 'WHERE-NUM-MAX-VALUES']
            # avoid empty results
            self.append_sql_generated(sql_tags[1:], queries[1:], questions[1:])

            queries, questions = _generate_given_value(min_value, col)
            sql_tags = ['WHERE-NUM-MIN-VALUES', 'WHERE-NUM-MIN-VALUES-EMPTY']
            # avoid empty results
            self.append_sql_generated(sql_tags[:1], queries[:1], questions[:1])

            queries, questions = _generate_given_value(mean_value, col)
            sql_tags = ['WHERE-NUM-MEAN-VALUES'] * len(queries)
            self.append_sql_generated(sql_tags, queries, questions)

    @staticmethod
    def _get_most_frequent_or_max_value(values: np.array):
        """
        Returns the most frequent value if the input is categorical, or the maximum value if numerical.
        Null values are not considered in the calculation.

        Args:
            values (np.array): Array of values, either categorical or numerical.

        Returns:
            Union[None, Any]: Most frequent value if categorical, max value if numerical, or None if input is empty.
        """
        if len(values) == 0:
            return None
        values = values[~pd.isna(values)]
        # update the dtype after removing the null values
        values = np.array(values.tolist())
        if np.issubdtype(values.dtype, np.number):
            return np.max(values)
        else:
            unique_values, counts = np.unique(values, return_counts=True)
            index_of_max_count = np.argmax(counts)
            most_frequent_value: str = unique_values[index_of_max_count]
            return most_frequent_value.replace('"', '').replace("'", '').strip()

    @staticmethod
    def _get_least_frequent_or_min_value(values):
        """
        Returns the least frequent value if the input is categorical, or the minimum value if numerical.
        Null values are not considered in the calculation.

        Args:
            values (np.array): Array of values, either categorical or numerical.

        Returns:
            Union[None, Any]: Least frequent value if categorical, min value if numerical, or None if input is empty.
        """
        if len(values) == 0:
            return None
        values = values[~pd.isna(values)]
        # update the dtype after removing the null values
        values = np.array(values.tolist())
        if np.issubdtype(values.dtype, np.number):
            return np.min(values)
        else:
            unique_values, counts = np.unique(values, return_counts=True)
            index_of_min_count = np.argmin(counts)
            lest_frequent_value: str = unique_values[index_of_min_count]
            return lest_frequent_value.replace('"', '').replace("'", '').strip()

    @staticmethod
    def _get_median_value(values):
        """
        Returns the median value if the input is numerical. Null values are not considered in the calculation.

        Args:
            values (np.array): Array of numerical values.

        Returns:
            Union[None, float]: Mean value of the input array, or None if input is empty or non-numerical.
        """
        if len(values) == 0:
            return None
        values = values[~pd.isna(values)]
        if np.issubdtype(values.dtype, np.number):
            return np.mean(values)
        else:
            return None

sql_generate(table_name)

Generates WHERE SQL queries and corresponding questions for both categorical and numerical columns.

Parameters:

Name Type Description Default
table_name str

The name of the table in the database.

required

Returns:

Name Type Description
dict dict[str, list]

A dictionary containing generated SQL tags, queries, questions, and results. Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

Examples:

Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"

>>> sample_df = pd.DataFrame({"colors": ["green", "blue", "blue", "blue", "blue"], "numbers": [1, 2, 3, 4, 5]})
>>> generator = WhereGenerator(database)
>>> generator._generate_where_categorical("table_name", ["colors"], sample_df)
>>> generator.sql_generated
{
    "sql_tags": ["WHERE-CAT-MOST-FREQUENT", "WHERE-CAT-LEAST-FREQUENT",
                'WHERE-CAT-MOST-FREQUENT', 'WHERE-CAT-LEAST-FREQUENT',
                "WHERE-NOT-MOST-FREQUENT", "WHERE-NOT-LEAST-FREQUENT"],
    "queries": [
        'SELECT * FROM "table_name" WHERE "colors" == "blue"',
        'SELECT * FROM "table_name" WHERE "colors" == "green"',
        'SELECT * FROM "table_name" WHERE "colors" != "blue"',
        'SELECT * FROM "table_name" WHERE "colors" != "green"',
        'SELECT * FROM "table_name" WHERE NOT "colors" == "blue"',
        'SELECT * FROM "table_name" WHERE NOT "colors" == "green"',
    ],
    "questions": [
        'Show the data of the table "table_name" where "colors" is equal to blue',
        'Show the data of the table "table_name" where "colors" is equal to green',
        'Show the data of the table "table_name" where "colors" is different from blue',
        'Show the data of the table "table_name" where "colors" is different from green',
        'Show the data of the table "table_name" where "colors" is not equal to blue',
        'Show the data of the table "table_name" where "colors" is not equal to green',
    ]
}
>>> generator._generate_where_numerical("table_name", ["numbers"], sample_df)
>>> generator.sql_generated
{
    "sql_tags": ['WHERE-NUM-MAX-VALUES', 'WHERE-NUM-MIN-VALUES',
                'WHERE-NUM-MEAN-VALUES', 'WHERE-NUM-MEAN-VALUES'],
    "queries": ['SELECT * FROM "table_name" WHERE "numbers" < 5',
                'SELECT * FROM "table_name" WHERE "numbers" > 1,
                'SELECT * FROM "table_name" WHERE "numbers" > 3.0'
                'SELECT * FROM "table_name" WHERE "numbers" < 3.0'],
    "question": ['Show the data of the table "table_name" where "numbers" is less than 5',
                'Show the data of the table "table_name" where "numbers" is greater than 1',
                'Show the data of the table "table_name" where "numbers" is greater than 3.0',
                'Show the data of the table "table_name" where "numbers" is less than 3.0'],
}
Source code in qatch/sql_generator/where_generator.py
def sql_generate(self, table_name: str) -> dict[str, list]:
    """
    Generates WHERE SQL queries and corresponding questions for both categorical and numerical columns.

    Args:
        table_name (str): The name of the table in the database.

    Returns:
        dict: A dictionary containing generated SQL tags, queries, questions, and results.
              Format: {"sql_tags": list[str], "queries": list[str], "questions": list[str]}

    Examples:
        Given a MultipleDatabases object "database" with a table "table_name" with columns "colors" and "numbers"
        >>> sample_df = pd.DataFrame({"colors": ["green", "blue", "blue", "blue", "blue"], "numbers": [1, 2, 3, 4, 5]})
        >>> generator = WhereGenerator(database)
        >>> generator._generate_where_categorical("table_name", ["colors"], sample_df)
        >>> generator.sql_generated
        {
            "sql_tags": ["WHERE-CAT-MOST-FREQUENT", "WHERE-CAT-LEAST-FREQUENT",
                        'WHERE-CAT-MOST-FREQUENT', 'WHERE-CAT-LEAST-FREQUENT',
                        "WHERE-NOT-MOST-FREQUENT", "WHERE-NOT-LEAST-FREQUENT"],
            "queries": [
                'SELECT * FROM "table_name" WHERE "colors" == "blue"',
                'SELECT * FROM "table_name" WHERE "colors" == "green"',
                'SELECT * FROM "table_name" WHERE "colors" != "blue"',
                'SELECT * FROM "table_name" WHERE "colors" != "green"',
                'SELECT * FROM "table_name" WHERE NOT "colors" == "blue"',
                'SELECT * FROM "table_name" WHERE NOT "colors" == "green"',
            ],
            "questions": [
                'Show the data of the table "table_name" where "colors" is equal to blue',
                'Show the data of the table "table_name" where "colors" is equal to green',
                'Show the data of the table "table_name" where "colors" is different from blue',
                'Show the data of the table "table_name" where "colors" is different from green',
                'Show the data of the table "table_name" where "colors" is not equal to blue',
                'Show the data of the table "table_name" where "colors" is not equal to green',
            ]
        }
        >>> generator._generate_where_numerical("table_name", ["numbers"], sample_df)
        >>> generator.sql_generated
        {
            "sql_tags": ['WHERE-NUM-MAX-VALUES', 'WHERE-NUM-MIN-VALUES',
                        'WHERE-NUM-MEAN-VALUES', 'WHERE-NUM-MEAN-VALUES'],
            "queries": ['SELECT * FROM "table_name" WHERE "numbers" < 5',
                        'SELECT * FROM "table_name" WHERE "numbers" > 1,
                        'SELECT * FROM "table_name" WHERE "numbers" > 3.0'
                        'SELECT * FROM "table_name" WHERE "numbers" < 3.0'],
            "question": ['Show the data of the table "table_name" where "numbers" is less than 5',
                        'Show the data of the table "table_name" where "numbers" is greater than 1',
                        'Show the data of the table "table_name" where "numbers" is greater than 3.0',
                        'Show the data of the table "table_name" where "numbers" is less than 3.0'],
        }
    """
    self.empty_sql_generated()
    df, cat_cols, num_cols = self._sample_cat_num_cols(table_name)
    self._generate_where_categorical(table_name, cat_cols, df)
    self._generate_where_numerical(table_name, num_cols, df)
    return self.sql_generated