From 54b2f71215c864aa96f7424a7fb57d984de8c7a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=B8=80=E6=8C=83?= <yizhi.fzh@alibaba-inc.com>
Date: Mon, 9 Aug 2021 19:46:13 +0800
Subject: [PATCH v4 6/6] Maintain the UniqueKey on Subquery and UpperRel level.

---
 src/backend/optimizer/path/allpaths.c   |   2 +
 src/backend/optimizer/path/pathkeys.c   |   3 +-
 src/backend/optimizer/path/uniquekey.c  | 179 ++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c    |  17 ++-
 src/include/optimizer/paths.h           |   6 +
 src/test/regress/expected/uniquekey.out |  36 +++++
 src/test/regress/sql/uniquekey.sql      |  21 +++
 7 files changed, 261 insertions(+), 3 deletions(-)

diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 343253d694..9225bf738e 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -2288,6 +2288,8 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
 		return;
 	}
 
+	populate_subquery_uniquekeys(root, rel, sub_final_rel);
+
 	/*
 	 * Mark rel with estimated output rows, width, etc.  Note that we have to
 	 * do this before generating outer-query paths, else cost_subqueryscan is
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 21f4f2d4bc..9a8a5b654c 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -33,7 +33,6 @@ static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
 											 RelOptInfo *partrel,
 											 int partkeycol);
-static Var *find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle);
 static bool right_merge_direction(PlannerInfo *root, PathKey *pathkey);
 
 
@@ -1035,7 +1034,7 @@ convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo *rel,
  * We need this to ensure that we don't return pathkeys describing values
  * that are unavailable above the level of the subquery scan.
  */
-static Var *
+Var *
 find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle)
 {
 	ListCell   *lc;
diff --git a/src/backend/optimizer/path/uniquekey.c b/src/backend/optimizer/path/uniquekey.c
index 815ade02c3..a073563c59 100644
--- a/src/backend/optimizer/path/uniquekey.c
+++ b/src/backend/optimizer/path/uniquekey.c
@@ -47,6 +47,13 @@ static void populate_joinrel_composite_uniquekey(PlannerInfo *root,
 												 bool outeruk_still_valid,
 												 bool inneruk_still_valid);
 
+static void convert_subquery_uniquekey(PlannerInfo *root, RelOptInfo *rel, UniqueKey *sub_ukey);
+static EquivalenceClass * find_outer_ec_with_subquery_em(PlannerInfo *root, RelOptInfo *rel,
+														 EquivalenceClass *sub_ec,
+														 EquivalenceMember *sub_em);
+static List *convert_subquery_eclass_list(PlannerInfo *root, RelOptInfo *rel,
+										  List *sub_eclass_list);
+
 /* UniqueKey is subset of .. */
 static bool uniquekey_contains_in(PlannerInfo *root, UniqueKey *ukey,
 								  List *ecs, Relids relids);
@@ -185,6 +192,57 @@ populate_joinrel_uniquekeys(PlannerInfo *root, RelOptInfo *joinrel,
 	return;
 }
 
+/*
+ * populate_subquery_uniquekeys
+ *		Convert each UniqueKey of the subquery's final rel for use on 'rel'.
+ *
+ * 'rel': outer query's RelOptInfo for the subquery relation.
+ * 'sub_final_rel': the subquery's final RelOptInfo; its uniquekeys are in subquery terms.
+ *
+ *  subquery issues: a). tlist mapping.  b). interesting uniquekey. c). not nulls.
+ */
+void
+populate_subquery_uniquekeys(PlannerInfo *root, RelOptInfo *rel, RelOptInfo *sub_final_rel)
+{
+	List	*sub_uniquekeys = sub_final_rel->uniquekeys;
+	ListCell	*lc;
+	foreach(lc, sub_uniquekeys)
+	{
+		UniqueKey *sub_ukey = lfirst_node(UniqueKey, lc);
+		convert_subquery_uniquekey(root, rel, sub_ukey);
+	}
+}
+
+/*
+ * populate_uniquekeys_from_pathkeys
+ *		Give 'rel' one UniqueKey built from the eclasses of 'pathkeys'; no-op if NIL.
+ */
+void
+populate_uniquekeys_from_pathkeys(PlannerInfo *root, RelOptInfo *rel, List *pathkeys)
+{
+	ListCell *lc;
+	List	*unique_exprs = NIL;
+	if (pathkeys == NIL)
+		return;
+	foreach(lc, pathkeys)
+	{
+		PathKey *pathkey = lfirst(lc);
+		unique_exprs = lappend(unique_exprs, pathkey->pk_eclass);
+	}
+	rel->uniquekeys = list_make1(
+		make_uniquekey(bms_make_singleton(list_length(root->unique_exprs)),
+					   false,
+					   true));
+	root->unique_exprs = lappend(root->unique_exprs, unique_exprs);	/* index recorded above */
+}
+
+/* simple_copy_uniquekeys: make tarrel share srcrel's uniquekeys list (pointer copy). */
+void
+simple_copy_uniquekeys(RelOptInfo *tarrel, RelOptInfo *srcrel)
+{
+	tarrel->uniquekeys = srcrel->uniquekeys;
+}
+
 /*
  * relation_is_distinct_for
  *		Check if the relation is distinct for.
@@ -712,6 +770,127 @@ is_uniquekey_useful_afterjoin(PlannerInfo *root, UniqueKey *ukey,
 	return true;
 }
 
+/*
+ * find_outer_ec_with_subquery_em
+ *		Given an em in the subquery, return the related outer EquivalenceClass.
+ *		Returns NULL if the em has no subquery tlist entry or no outer Var.
+ */
+static EquivalenceClass *
+find_outer_ec_with_subquery_em(PlannerInfo *root, RelOptInfo *rel,
+							   EquivalenceClass *sub_ec, EquivalenceMember *sub_em)
+{
+	TargetEntry *sub_tle;
+	Var *outer_var;
+	EquivalenceClass *outer_ec;
+
+	sub_tle = get_tle_from_expr(sub_em->em_expr, rel->subroot->processed_tlist);
+
+	if (!sub_tle)
+		return NULL;
+
+	outer_var = find_var_for_subquery_tle(rel, sub_tle);
+	if (!outer_var)
+		return NULL;
+
+	outer_ec = get_eclass_for_sort_expr(root,
+										(Expr *)outer_var,
+										NULL,
+										sub_ec->ec_opfamilies,
+										sub_em->em_datatype,
+										sub_ec->ec_collation,
+										0,
+										rel->relids,
+										false);	/* presumably create_it = false (lookup only) -- verify */
+	return outer_ec;
+}
+
+
+/*
+ * convert_subquery_eclass_list
+ *
+ *		Given a list of eclasses in the subquery, find the corresponding eclasses on the
+ * outer side.  Returns NIL as soon as any subquery eclass has no outer counterpart.
+ */
+static List *
+convert_subquery_eclass_list(PlannerInfo *root, RelOptInfo *rel, List *sub_eclass_list)
+{
+	ListCell	*lc;
+	List	*ec_list = NIL;
+	foreach(lc, sub_eclass_list)
+	{
+		EquivalenceClass *sub_ec = lfirst_node(EquivalenceClass, lc);
+		EquivalenceClass *ec = NULL;
+		ListCell	*emc;
+		foreach(emc, sub_ec->ec_members)
+		{
+			EquivalenceMember *sub_em = lfirst(emc);
+			if ((ec = find_outer_ec_with_subquery_em(root, rel, sub_ec, sub_em)) != NULL)
+				break;
+		}
+		if (!ec)
+			return NIL;
+		ec_list = lappend(ec_list, ec);
+	}
+	return ec_list;
+}
+
+
+/*
+ * convert_subquery_uniquekey
+ *		Translate one subquery UniqueKey to outer-query terms and add it to 'rel'.
+ */
+static void
+convert_subquery_uniquekey(PlannerInfo *root, RelOptInfo *rel, UniqueKey *sub_ukey)
+{
+	PlannerInfo *sub_root = rel->subroot;
+	List	*unique_exprs_list = NIL;
+	Bitmapset	*unique_exprs_indexes = NULL;
+	UniqueKey	*ukey = NULL;
+	int i = -1;
+	ListCell	*lc;
+	while((i = bms_next_member(sub_ukey->unique_expr_indexes, i)) >= 0)
+	{
+		Node *sub_eq_list = list_nth(sub_root->unique_exprs, i);
+		if (IsA(sub_eq_list, SingleRow))
+		{
+			/*
+			 * TODO: SingleRow is not converted yet; give up on this UniqueKey.
+			 */
+			list_free(unique_exprs_list);
+			return;
+		}
+		else
+		{
+			List *upper_eq_list;
+			Assert(IsA(sub_eq_list, List));
+			/*
+			 * Note: upper_eq_list is just one part of the uniquekey's exprs; to convert the
+			 * whole UniqueKey, every part must be expressible in the upper rel.
+			 */
+			upper_eq_list = convert_subquery_eclass_list(root, rel, (List *)sub_eq_list);
+			if (upper_eq_list == NIL)
+			{
+				/* Lists must be released with list_free(), which also accepts NIL. */
+				list_free(unique_exprs_list);
+				return;
+			}
+			unique_exprs_list = lappend(unique_exprs_list, upper_eq_list);
+		}
+	}
+
+	foreach(lc, unique_exprs_list)
+	{
+		unique_exprs_indexes = bms_add_member(unique_exprs_indexes, list_length(root->unique_exprs));
+		root->unique_exprs = lappend(root->unique_exprs, lfirst(lc));
+	}
+
+	ukey = make_uniquekey(unique_exprs_indexes,
+						  sub_ukey->multi_nulls,
+						  /* TODO: need check again, case SELECT * FROM (SELECT u FROM x OFFSET 0) v where x.u = 0; */
+						  true);
+	rel->uniquekeys = lappend(rel->uniquekeys, ukey);
+}
+
 /*
  *	make_uniquekey
  */
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index ed2cff00fc..a268737a4c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1650,7 +1650,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	 * Now we are prepared to build the final-output upperrel.
 	 */
 	final_rel = fetch_upper_rel(root, UPPERREL_FINAL, NULL);
-
+	simple_copy_uniquekeys(final_rel, current_rel);
 	/*
 	 * If the input rel is marked consider_parallel and there's nothing that's
 	 * not parallel-safe in the LIMIT clause, then the final_rel can be marked
@@ -3622,6 +3622,19 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
 									  gd,
 									  extra->targetList);
 
+	if (root->parse->groupingSets)
+	{
+		/* nothing to do */
+	}
+	else if (root->parse->groupClause && root->group_pathkeys != NIL)
+	{
+		populate_uniquekeys_from_pathkeys(root, grouped_rel, root->group_pathkeys);
+	}
+	else
+	{
+		/* SingleRow Case */
+	}
+
 	/* Build final grouping paths */
 	add_paths_to_grouping_rel(root, input_rel, grouped_rel,
 							  partially_grouped_rel, agg_costs, gd,
@@ -4251,6 +4264,8 @@ create_distinct_paths(PlannerInfo *root,
 	/* For now, do all work in the (DISTINCT, NULL) upperrel */
 	distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL);
 
+	populate_uniquekeys_from_pathkeys(root, distinct_rel, root->distinct_pathkeys);
+
 	/*
 	 * We don't compute anything at this level, so distinct_rel will be
 	 * parallel-safe if the input rel is parallel-safe.  In particular, if
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index f233837e59..b9570807af 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -265,11 +265,17 @@ extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
 
 extern void populate_baserel_uniquekeys(PlannerInfo *root,
 										RelOptInfo *baserel);
+extern Var *find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle);
 extern void populate_baserel_uniquekeys(PlannerInfo *root,
 										RelOptInfo *baserel);
 extern void populate_joinrel_uniquekeys(PlannerInfo *root, RelOptInfo *joinrel,
 										RelOptInfo *outerrel, RelOptInfo *innerrel,
 										List *restrictlist, JoinType jointype);
+extern void populate_uniquekeys_from_pathkeys(PlannerInfo *root, RelOptInfo *rel,
+											  List *pathkeys);
+extern void populate_subquery_uniquekeys(PlannerInfo *root, RelOptInfo *rel,
+										 RelOptInfo *sub_final_rel);
+extern void simple_copy_uniquekeys(RelOptInfo *tarrel, RelOptInfo *srcrel);
 extern bool relation_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
 									 List *distinct_pathkey);
 #endif							/* PATHS_H */
diff --git a/src/test/regress/expected/uniquekey.out b/src/test/regress/expected/uniquekey.out
index c2bd1fa619..a86e413fcf 100644
--- a/src/test/regress/expected/uniquekey.out
+++ b/src/test/regress/expected/uniquekey.out
@@ -407,3 +407,39 @@ EXPLAIN (COSTS OFF) SELECT DISTINCT uqk1.d FROM uqk1, uqk2 WHERE uqk1.pk = 1 AND
                ->  Seq Scan on uqk2
 (8 rows)
 
+-----------------------------------------
+-- Test DISTINCT/GROUP BY CASE.
+-----------------------------------------
+--------------------------------------------------------------------------------------------
+-- Test subquery cases.
+-- Note that current the UniqueKey still not push down the interesting UniqueKey to subquery.
+-- like uniquekey, so the below test case need a "DISTINCT" in subquery to make sure the
+-- UniqueKey is maintain.
+--------------------------------------------------------------------------------------------
+-- Test a normal case - one side
+EXPLAIN SELECT DISTINCT v.* FROM
+(SELECT DISTINCT uqk1.c, uqk1.d FROM uqk1, uqk2
+WHERE uqk1.a = uqk2.pk AND uqk1.c is not null offset 0) v;
+                           QUERY PLAN                           
+----------------------------------------------------------------
+ Hash Join  (cost=1.07..2.14 rows=2 width=8)
+   Hash Cond: (uqk1.a = uqk2.pk)
+   ->  Seq Scan on uqk1  (cost=0.00..1.05 rows=3 width=12)
+         Filter: (c IS NOT NULL)
+   ->  Hash  (cost=1.03..1.03 rows=3 width=4)
+         ->  Seq Scan on uqk2  (cost=0.00..1.03 rows=3 width=4)
+(6 rows)
+
+-- Test a normal case - composited side.
+EXPLAIN SELECT DISTINCT v.* FROM
+(SELECT DISTINCT t1.c, t1.d, t2.pk FROM uqk1 t1 cross join uqk2 t2 where t1.c is not null OFFSET 0)
+v;
+                            QUERY PLAN                             
+-------------------------------------------------------------------
+ Nested Loop  (cost=0.00..2.20 rows=9 width=12)
+   ->  Seq Scan on uqk1 t1  (cost=0.00..1.05 rows=3 width=8)
+         Filter: (c IS NOT NULL)
+   ->  Materialize  (cost=0.00..1.04 rows=3 width=4)
+         ->  Seq Scan on uqk2 t2  (cost=0.00..1.03 rows=3 width=4)
+(5 rows)
+
diff --git a/src/test/regress/sql/uniquekey.sql b/src/test/regress/sql/uniquekey.sql
index 3f93872246..22e236eafb 100644
--- a/src/test/regress/sql/uniquekey.sql
+++ b/src/test/regress/sql/uniquekey.sql
@@ -106,3 +106,24 @@ SELECT uqk1.c, uqk2.c FROM uqk1, uqk2 WHERE uqk1.pk = 2 AND uqk2.pk = 1 order BY
 -----------------------------------------
 EXPLAIN (COSTS OFF) SELECT DISTINCT uqk1.pk FROM uqk1, uqk2 WHERE uqk1.c = uqk2.c;
 EXPLAIN (COSTS OFF) SELECT DISTINCT uqk1.d FROM uqk1, uqk2 WHERE uqk1.pk = 1 AND uqk1.c = uqk2.c;
+
+-----------------------------------------
+-- Test DISTINCT/GROUP BY CASE.
+-----------------------------------------
+
+
+--------------------------------------------------------------------------------------------
+-- Test subquery cases.
+-- Note that current the UniqueKey still not push down the interesting UniqueKey to subquery.
+-- like uniquekey, so the below test case need a "DISTINCT" in subquery to make sure the
+-- UniqueKey is maintain.
+--------------------------------------------------------------------------------------------
+-- Test a normal case - one side
+EXPLAIN SELECT DISTINCT v.* FROM
+(SELECT DISTINCT uqk1.c, uqk1.d FROM uqk1, uqk2
+WHERE uqk1.a = uqk2.pk AND uqk1.c is not null offset 0) v;
+
+-- Test a normal case - composited side.
+EXPLAIN SELECT DISTINCT v.* FROM
+(SELECT DISTINCT t1.c, t1.d, t2.pk FROM uqk1 t1 cross join uqk2 t2 where t1.c is not null OFFSET 0)
+v;
-- 
2.21.0

