From 15433c382aa94cf1033926fa08018b865ff05535 Mon Sep 17 00:00:00 2001
From: wangxiaoran
Date: Mon, 27 May 2024 18:29:20 +0800
Subject: [PATCH 01/40] Add perfmon to hashdata-lightning

perfmon contains 3 parts:
- gpmon can collect query info; it mainly uses query_info_collect_hook
  to implement that. Use it by loading gpmon.so into postgres. It only
  runs on the coordinator.
- gpsmon can collect system and disk info. It is installed on each host
  of the cluster (including the coordinator). It also accepts and
  manages the query info sent from gpmon.
- gpmmon is also a shared object (gpmmon.so) and should be loaded into
  postgres too. It starts and stops with the cluster. gpmmon sends a
  command to gpsmon every 'quantum' to collect the system and query
  info, and dumps the data into the tables in the gpperfmon database.
  gpmmon reads gpperfmon.conf when starting, checks the gpperfmon
  database, and connects to gpsmon.

Tips: gpmmon uses libpq to connect to the gpperfmon database, so it is
compiled as FRONTEND. The next version will use SPI instead of libpq to
connect to the db and combine it with gpmon.so into one.

To build perfmon, configure hashdata-lightning with --enable-perfmon

gpperfmon_install, a python script, helps to create the gpperfmon
database and related tables. It also creates the user gpmon. Besides
that, it uses gpconfig to set some gucs used by perfmon:
    gpconfig -c perfmon.enable -v on
    gpconfig -c perfmon.port -v 8888
    gpconfig -c gp_external_enable_exec -v on --masteronly
perfmon.port is used to assign a port for gpsmon.

How to use perfmon?
- run gpperfmon_install
    gpperfmon_install --enable --port 7000 --password 123
- restart the cluster

perfmon.enable's default value is off; gpperfmon_install will set it to
true. When you want to disable perfmon, run
"gpconfig -c perfmon.enable -v 'off'" and restart the cluster.
TODO - the regress test for it is not completed yet - should rename gpperfmonxx to perfmonxx --- GNUmakefile.in | 6 + configure | 42 + configure.ac | 13 + contrib/perfmon/.gitignore | 7 + contrib/perfmon/Makefile | 42 + contrib/perfmon/README.md | 38 + contrib/perfmon/README_hashdata.md | 9 + contrib/perfmon/expected/guc_config.out | 72 + contrib/perfmon/expected/query.out | 0 contrib/perfmon/gpmon_catqrynow.py | 48 + contrib/perfmon/gpperfmon.conf | 48 + contrib/perfmon/gpperfmon.sql | 299 +++ contrib/perfmon/gpperfmon_install | 242 +++ contrib/perfmon/gpperfmoncat.sh | 50 + contrib/perfmon/sql/guc_config.sql | 43 + contrib/perfmon/sql/query.sql | 9 + contrib/perfmon/src/common/gpmonlib.c | 559 +++++ contrib/perfmon/src/gpmmon/Makefile | 18 + contrib/perfmon/src/gpmmon/gpmmon.c | 1858 +++++++++++++++++ contrib/perfmon/src/gpmmon/gpmon_agg.c | 1556 ++++++++++++++ contrib/perfmon/src/gpmmon/gpmon_agg.h | 14 + contrib/perfmon/src/gpmmon/gpmondb.c | 1729 +++++++++++++++ contrib/perfmon/src/gpmmon/gpmondb.h | 97 + contrib/perfmon/src/gpmon/Makefile | 15 + contrib/perfmon/src/gpmon/gpmon.c | 512 +++++ contrib/perfmon/src/gpsmon/Makefile | 37 + contrib/perfmon/src/gpsmon/gpsmon.c | 1772 ++++++++++++++++ contrib/perfmon/src/include/gpmon.h | 290 +++ contrib/perfmon/src/include/gpmonlib.h | 247 +++ src/Makefile.global.in | 1 + src/include/pg_config.h.in | 3 + .../utils/process_shared_preload_libraries.h | 3 + 32 files changed, 9679 insertions(+) create mode 100644 contrib/perfmon/.gitignore create mode 100644 contrib/perfmon/Makefile create mode 100644 contrib/perfmon/README.md create mode 100644 contrib/perfmon/README_hashdata.md create mode 100644 contrib/perfmon/expected/guc_config.out create mode 100644 contrib/perfmon/expected/query.out create mode 100644 contrib/perfmon/gpmon_catqrynow.py create mode 100644 contrib/perfmon/gpperfmon.conf create mode 100644 contrib/perfmon/gpperfmon.sql create mode 100755 contrib/perfmon/gpperfmon_install create mode 100755 
contrib/perfmon/gpperfmoncat.sh create mode 100644 contrib/perfmon/sql/guc_config.sql create mode 100644 contrib/perfmon/sql/query.sql create mode 100644 contrib/perfmon/src/common/gpmonlib.c create mode 100644 contrib/perfmon/src/gpmmon/Makefile create mode 100644 contrib/perfmon/src/gpmmon/gpmmon.c create mode 100644 contrib/perfmon/src/gpmmon/gpmon_agg.c create mode 100644 contrib/perfmon/src/gpmmon/gpmon_agg.h create mode 100644 contrib/perfmon/src/gpmmon/gpmondb.c create mode 100644 contrib/perfmon/src/gpmmon/gpmondb.h create mode 100644 contrib/perfmon/src/gpmon/Makefile create mode 100644 contrib/perfmon/src/gpmon/gpmon.c create mode 100644 contrib/perfmon/src/gpsmon/Makefile create mode 100644 contrib/perfmon/src/gpsmon/gpsmon.c create mode 100644 contrib/perfmon/src/include/gpmon.h create mode 100644 contrib/perfmon/src/include/gpmonlib.h diff --git a/GNUmakefile.in b/GNUmakefile.in index e6333e39bec..c7c86cfed14 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -34,6 +34,9 @@ ifeq ($(with_openssl), yes) endif ifeq ($(enable_pax), yes) $(MAKE) -C contrib/pax_storage all +endif +ifeq ($(enable_perfmon), yes) + $(MAKE) -C contrib/perfmon all endif $(MAKE) -C gpMgmt all $(MAKE) -C gpcontrib all @@ -81,6 +84,9 @@ ifeq ($(enable_pax), yes) endif ifeq ($(with_openssl), yes) $(MAKE) -C contrib/sslinfo $@ +endif +ifeq ($(enable_perfmon), yes) + $(MAKE) -C contrib/perfmon $@ endif $(MAKE) -C gpMgmt $@ $(MAKE) -C gpcontrib $@ diff --git a/configure b/configure index e612c658399..9217e3d369b 100755 --- a/configure +++ b/configure @@ -751,6 +751,7 @@ ICU_CFLAGS with_icu enable_thread_safety INCLUDES +enable_perfmon enable_pax enable_preload_ic_module enable_ic_proxy @@ -903,6 +904,7 @@ enable_external_fts enable_ic_proxy enable_preload_ic_module enable_pax +enable_perfmon enable_thread_safety with_icu with_tcl @@ -1619,6 +1621,7 @@ Optional Features: --disable-preload-ic-module disable preload interconnect module --enable-pax enable pax support + --enable-perfmon 
enable preload gophermeta --disable-thread-safety disable thread-safety in client libraries --enable-openssl-redirect enable redirect openssl interface to internal @@ -9132,6 +9135,35 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build with pax support ... $enable_pax" >&5 $as_echo "checking whether to build with pax support ... $enable_pax" >&6; } +# +# perfmon support +# + + +# Check whether --enable-perfmon was given. +if test "${enable_perfmon+set}" = set; then : + enableval=$enable_perfmon; + case $enableval in + yes) + +$as_echo "#define USE_PERFMON 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --enable-perfmon option" "$LINENO" 5 + ;; + esac + +else + enable_perfmon=no + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: checking whether to build with vectorization support ... $enable_perfmon" >&5 +$as_echo "checking whether to build with vectorization support ... $enable_perfmon" >&6; } + # # Include directories # @@ -22146,6 +22178,16 @@ fi CPPFLAGS=$ac_save_CPPFLAGS fi +# check for apr +if test "$enable_perfmon" = yes; then + ac_fn_c_check_header_mongrel "$LINENO" "apr_getopt.h" "ac_cv_header_apr_getopt_h" "$ac_includes_default" +if test "x$ac_cv_header_apr_getopt_h" = xyes; then : + +else + as_fn_error $? "header file is required for perfmon" "$LINENO" 5 +fi +fi + # # Check for documentation-building tools # diff --git a/configure.ac b/configure.ac index a1e3d9709db..7c864e2d161 100644 --- a/configure.ac +++ b/configure.ac @@ -936,6 +936,15 @@ AC_MSG_RESULT([checking whether to build with PAX support ... $enable_pax]) AC_SUBST(enable_pax) # +# perfmon support +# +PGAC_ARG_BOOL(enable, perfmon, no, + [enable perfmon support], + [AC_DEFINE(USE_PERFMON, 1, + [Define to 1 to support perfmon])]) +AC_MSG_RESULT([checking whether to build with perfmon support ... 
$enable_perfmon]) +AC_SUBST(enable_perfmon) + # Include directories # ac_save_IFS=$IFS @@ -2882,6 +2891,10 @@ if test "$with_python" = yes; then CPPFLAGS=$ac_save_CPPFLAGS fi +# check for apr +if test "$enable_perfmon" = yes; then + AC_CHECK_HEADER(apr_getopt.h, [], [AC_MSG_ERROR([header file is required for perfmon])]) +fi # # Check for documentation-building tools # diff --git a/contrib/perfmon/.gitignore b/contrib/perfmon/.gitignore new file mode 100644 index 00000000000..f201ec88242 --- /dev/null +++ b/contrib/perfmon/.gitignore @@ -0,0 +1,7 @@ +./gpsmon +./gpmon.so +./gpmmon.so +./src/gpsmon/gpsmon +./src/gpmmon/gpmmon.so +./src/gpmon/gpmon.so +./gpperfmon--?.?.?.sql diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile new file mode 100644 index 00000000000..196ec42acf3 --- /dev/null +++ b/contrib/perfmon/Makefile @@ -0,0 +1,42 @@ +NAME = gpperfmon +top_builddir = ../../ +REGRESS = guc_config query + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/perfmon +include $(top_builddir)/src/Makefile.global +endif + +clean distclean: + $(MAKE) -C src/gpmon clean + $(MAKE) -C src/gpmmon clean + $(MAKE) -C src/gpsmon clean + rm -rf gpmon.so + rm -rf gpsmon + rm -rf gpperfmon.so + +all: + $(MAKE) -C src/gpmon all + $(MAKE) -C src/gpmmon all + $(MAKE) -C src/gpsmon all + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)/../sbin' +install: installdirs + $(MAKE) -C src/gpmon install + $(MAKE) -C src/gpmmon install + $(MAKE) -C src/gpsmon install + $(INSTALL_SCRIPT) gpperfmon_install '$(bindir)' + $(INSTALL_SCRIPT) gpperfmoncat.sh '$(DESTDIR)$(bindir)' + $(INSTALL_SCRIPT) gpmon_catqrynow.py '$(DESTDIR)$(bindir)/../sbin/' + mkdir -p '$(DESTDIR)$(libdir)/$(NAME)' + $(INSTALL_SCRIPT) $(NAME).sql '$(DESTDIR)$(libdir)/$(NAME)' + $(INSTALL_SCRIPT) $(NAME).conf '$(DESTDIR)$(libdir)/$(NAME)' + diff --git 
a/contrib/perfmon/README.md b/contrib/perfmon/README.md new file mode 100644 index 00000000000..1f1414290e1 --- /dev/null +++ b/contrib/perfmon/README.md @@ -0,0 +1,38 @@ +# gpperfmon + +gpperfmon tracks a variety of queries, statistics, system properties, and metrics. + +Find more information about the architecture on [the wiki page](https://github.com/greenplum-db/gpdb/wiki/Gpperfmon-Overview) + +## Libraries Required + +### libsigar: + https://github.com/hyperic/sigar + For CentOS 6: + yum install sigar-devel + For macOS and CentOS 7 (and others): + Use this updated fork: https://github.com/boundary/sigar + to build: + `mkdir build && cd build && cmake .. && make && make install` + +## Troubleshooting + For macOS: + You may hit a reverse look up issue when viewing the logs of gpperfmon by default $MASTER_DATA_DIRECTORY/gpperfmon/logs +``` + 2017-04-11 14:59:56.821681 + PDT,"gpmon","gpperfmon",p40501,th-1633193024,"::1","54006",2017-04-11 14:59:56 + PDT,0,con5,,seg-1,,,,sx1,"FATAL","28000","no pg_hba.conf entry for host + ""::1"", user ""gpmon"", database ""gpperfmon""",,,,,,,0,,"auth.c",608, ``` +``` + And also issues at $MASTER_DATA_DIRECTORY/pg_log: +``` + Performance Monitor - failed to connect to gpperfmon database: could not connect to server: No such file or directory + Is the server running locally and accepting + connections on Unix domain socket ""/var/pgsql_socket/.s.PGSQL.15432""?",,,,,,,,"SysLoggerMain","syslogger.c",618, +``` + to get pass this you need to do 2 things: + 1) export PGHOST=foo # where 'foo' is your hostname that is NOT localhost + 2) sudo /etc/hosts # and separate out (re)definitions of 127.0.0.1, something like: + 127.0.0.1 foo + 127.0.0.1 localhost + diff --git a/contrib/perfmon/README_hashdata.md b/contrib/perfmon/README_hashdata.md new file mode 100644 index 00000000000..01b44fa43ae --- /dev/null +++ b/contrib/perfmon/README_hashdata.md @@ -0,0 +1,9 @@ +1. 
gp_elog and guc:'gpperfmon_log_alert_level' have been +removed in hashdata-lightning + - disable check_disk_space + - disable message_main + - disable gpdb_import_alert_log +2. load gpmon as a shared library + - disable parse_command_line. + - get opt.port and opt.conf_file by xx + - modify the Makefile and gpperfmon_install diff --git a/contrib/perfmon/expected/guc_config.out b/contrib/perfmon/expected/guc_config.out new file mode 100644 index 00000000000..11e798c6420 --- /dev/null +++ b/contrib/perfmon/expected/guc_config.out @@ -0,0 +1,72 @@ +-- Disable perfmon.enable +-- start_ignore +\! gpconfig -c perfmon.enable -v false +\! gpstop -ari +-- end_ignore +\! ps -ef | grep '\[gpmmon\]' | wc -l +0 +\c gpperfmon +show perfmon.enable; + perfmon.enable +---------------- + off +(1 row) + +-- start_ignore +\! gpconfig -c perfmon.enable -v true +\! gpstop -ari +-- end_ignore +\! ps -ef | grep '\[gpmmon\]' | wc -l +1 +\c gpperfmon +show perfmon.enable; + perfmon.enable +---------------- + on +(1 row) + +show perfmon.port; + perfmon.port +-------------- + 8848 +(1 row) + +CREATE OR REPLACE FUNCTION wait_for_gpsmon_work() RETURNS void AS $$ +DECLARE +start_time timestamptz := clock_timestamp(); +updated bool; +starttime timestamptz; +BEGIN + select COALESCE(ctime,CURRENT_TIMESTAMP) from diskspace_now into starttime; + -- we don't want to wait forever; loop will exit after 60 seconds + FOR i IN 1 .. 600 LOOP + SELECT(SELECT count(*) > 0 from diskspace_now + WHERE ctime > starttime) INTO updated; + EXIT WHEN updated; + + -- wait a little + PERFORM pg_sleep_for('100 milliseconds'); + END LOOP; + -- report time waited in postmaster log (where it won't change test output) + RAISE log 'wait_for_gpsmon_work delayed % seconds', + EXTRACT(epoch FROM clock_timestamp() - start_time); +END +$$ LANGUAGE plpgsql; +select wait_for_gpsmon_work(); + wait_for_gpsmon_work +---------------------- + +(1 row) + +select count(*) from diskspace_now; + count +------- + 1 +(1 row) + +\! 
netstat -anp | grep udp | grep gpsmon | wc -l +(Not all processes could be identified, non-owned process info + will not be shown, you would have to be root to see it all.) +1 +\! ps -ef | grep gpsmon | grep -v grep | wc -l +1 diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out new file mode 100644 index 00000000000..e69de29bb2d diff --git a/contrib/perfmon/gpmon_catqrynow.py b/contrib/perfmon/gpmon_catqrynow.py new file mode 100644 index 00000000000..676889b8e79 --- /dev/null +++ b/contrib/perfmon/gpmon_catqrynow.py @@ -0,0 +1,48 @@ +import os, sys, time + +GPMONDIR = 'gpperfmon/data' + + +# for each line in queries_now.dat +# open the q{tmid}-{xid}-{cid}.txt file to retrieve query text/ query plan +# append to line +# print the line + +for line in open(os.path.join(GPMONDIR, "queries_now.dat")): + line = line.split('|') + (tmid, xid, cid) = line[1:4] + qrytxt = '' + appname = '' + rsqname = '' + priority = '' + fp = None + try: + fp = open(os.path.join(GPMONDIR, "q%s-%s-%s.txt" % (tmid, xid, cid)), 'r') + meta = fp.readline().split(' ') + qrytxt = fp.read(int(meta[0])).strip() + + newline = fp.readline() + meta = fp.readline().split(' ') + appname = fp.read(int(meta[0])).strip() + + newline = fp.readline() + meta = fp.readline().split(' ') + rsqname = fp.read(int(meta[0])).strip() + + newline = fp.readline() + meta = fp.readline().split(' ') + priority = fp.read(int(meta[0])).strip() + + fp.close() + except: + qrytxt = "Query text unavailable" + if fp: fp.close() + + # escape all " with "" + if qrytxt: + qrytxt = '""'.join(qrytxt.split('"')) + line[-5] = '"' + qrytxt + '"' + line[-3] = '"' + appname + '"' + line[-2] = '"' + rsqname + '"' + line[-1] = '"' + priority + '"' + print '|'.join(line).strip() diff --git a/contrib/perfmon/gpperfmon.conf b/contrib/perfmon/gpperfmon.conf new file mode 100644 index 00000000000..e8bc342001e --- /dev/null +++ b/contrib/perfmon/gpperfmon.conf @@ -0,0 +1,48 @@ +[GPMMON] +# quantum specifies the 
time in seconds between updates from
+# performance monitor agents on all segments. Valid values
+# are 5, 10, 15, 20, 30, or 60
+quantum = 15
+
+# min_query_time specifies the minimum query run time
+# in seconds for statistics collection. The monitor logs all
+# queries that run longer than this value in the queries_history
+# table. For queries with shorter run times, no historical
+# data is collected.
+min_query_time = 20
+
+# This should be a percentage between 0 and 100 and should be
+# less than the error_disk_space_percentage. If a filesystem's
+# disk space used percentage equals or exceeds this value a
+# warning will be logged and a warning email/snmp trap may be
+# sent. If this configuration is set to 0 or not specified, no
+# warnings are sent.
+#warning_disk_space_percentage = 80
+
+# This should be a percentage between 0 and 100 and should be
+# greater than the warning_disk_space_percentage. If a
+# filesystem's disk space used percentage equals or exceeds
+# this value an error will be logged and an error email/snmp
+# trap may be sent. If this configuration is set to 0 or not
+# specified, no errors are sent.
+#error_disk_space_percentage = 90
+
+#This is the interval in minutes that limits the number of
+#error/warning messages that are sent. The minimum value for
+#this configuration is 1. Setting this to 0 or not specifying
+#this configuration results in it getting set to the minimum.
+disk_space_interval = 60
+
+#This is the maximum number of error/warning messages that
+#will be sent in the disk_space_interval. The maximum value
+#for this configuration is 50. The minimum value for this
+#configuration is 1. Setting this configuration to greater
+#than 50 or not specifying this configuration results in it
+#getting set to the maximum.
+max_disk_space_messages_per_interval = 10
+
+# The number of partitions for statistics data in month
+# will be retained. Older partitions will be dropped.
+#partition_age = 6 + +log_location = gpperfmon/logs diff --git a/contrib/perfmon/gpperfmon.sql b/contrib/perfmon/gpperfmon.sql new file mode 100644 index 00000000000..81f93a7a1c6 --- /dev/null +++ b/contrib/perfmon/gpperfmon.sql @@ -0,0 +1,299 @@ +-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +-- Gpperfmon Schema + +-- Note: In 4.x, this file was run as part of upgrade (in single user mode). +-- Therefore, we could not make use of psql escape sequences such as +-- "\c gpperfmon" and every statement had to be on a single line. +-- +-- Violating the above _would_ break 4.x upgrades. +-- + +-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +-- system +-- +\c gpperfmon; + +create table public.system_history ( + ctime timestamptz(0) not null, -- record creation time + hostname varchar(64) not null, -- hostname of system this metric belongs to + mem_total bigint not null, mem_used bigint not null, -- total system memory + mem_actual_used bigint not null, mem_actual_free bigint not null, -- memory used + swap_total bigint not null, swap_used bigint not null, -- total swap space + swap_page_in bigint not null, swap_page_out bigint not null, -- swap pages in + cpu_user float not null, cpu_sys float not null, cpu_idle float not null, -- cpu usage + load0 float not null, load1 float not null, load2 float not null, -- cpu load avgs + quantum int not null, -- interval between metric collection for this entry + disk_ro_rate bigint not null, -- system disk read ops per second + disk_wo_rate bigint not null, -- system disk write ops per second + disk_rb_rate bigint not null, -- system disk read bytes per second + disk_wb_rate bigint not null, -- system disk write bytes per second + net_rp_rate bigint not null, -- system net read packets per second + net_wp_rate bigint not null, -- system net write packets per second + net_rb_rate bigint not null, -- system net read bytes per second + 
net_wb_rate bigint not null -- system net write bytes per second +) +with (fillfactor=100) +distributed by (ctime) +partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +create external web table public.system_now ( + like public.system_history +) execute 'cat gpperfmon/data/system_now.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +create external web table public.system_tail ( + like public.system_history +) execute 'cat gpperfmon/data/system_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +create external web table public._system_tail ( + like public.system_history +) execute 'cat gpperfmon/data/_system_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +-- queries +-- + +create table public.queries_history ( + ctime timestamptz(0), -- record creation time + tmid int not null, -- time id + ssid int not null, -- session id + ccnt int not null, -- command count in session + pid int not null, + username varchar(64) not null, -- username that issued the query + dbid oid not null, -- database oid for the query + cost int not null, -- query cost (not implemented) + tsubmit timestamptz(0) not null, -- query submit time + tstart timestamptz(0), -- query start time + tfinish timestamptz(0) not null, -- query end time + status varchar(64) not null, -- query status (start, end, abort) + rows_out bigint not null, -- rows out for query + cpu_elapsed bigint not null, -- cpu usage for query across all segments + cpu_currpct float not null, -- current cpu percent avg for all processes executing query + skew_cpu float not null, -- coefficient of variance for cpu_elapsed of iterators across segments for query + skew_rows float not null, -- coefficient of variance for rows_in of iterators across segments for 
query + query_hash bigint not null, -- (not implemented) + query_text text not null default '', -- query text + query_plan text not null default '', -- query plan (not implemented) + application_name varchar(64), -- from 4.2 onwards + rsqname varchar(64), -- from 4.2 onwards + rqppriority varchar(16) -- from 4.2 onwards +) +with (fillfactor=100) +distributed by (ctime) +partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + + +create external web table public.queries_now ( + like public.queries_history +) execute 'python $GPHOME/sbin/gpmon_catqrynow.py 2> /dev/null || true' on master format 'csv' (delimiter '|' NULL as 'null'); + +create external web table public.queries_now_fast ( + ctime timestamptz(0), + tmid int, + ssid int, -- gp_session_id + ccnt int, -- gp_command_count + username varchar(64), + dbid oid, + cost int, + tsubmit timestamptz(0), + tstart timestamptz(0), + tfinish timestamptz(0), + status varchar(64), + rows_out bigint, + cpu_elapsed bigint, + cpu_currpct float, + skew_cpu float, -- always 0 + skew_rows float + -- excluded: query_text text + -- excluded: query_plan text +) execute 'cat gpperfmon/data/queries_now.dat 2> /dev/null || true' on master format 'csv' (delimiter '|' NULL as 'null'); + +create external web table public.queries_tail ( + like public.queries_history +) execute 'cat gpperfmon/data/queries_tail.dat 2> /dev/null || true' on master format 'csv' (delimiter '|' NULL as 'null'); + + +create external web table public._queries_tail ( + like public.queries_history +) execute 'cat gpperfmon/data/_queries_tail.dat 2> /dev/null || true' on master format 'csv' (delimiter '|' NULL as 'null'); + +-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+-- database +-- + +create table public.database_history ( + ctime timestamptz(0) not null, -- record creation time + queries_total int not null, -- total number of queries + queries_running int not null, -- number of running queries + queries_queued int not null -- number of queued queries +) +with (fillfactor=100) +distributed by (ctime) +partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +create external web table public.database_now ( + like public.database_history +) execute 'cat gpperfmon/data/database_now.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +create external web table public.database_tail ( + like public.database_history +) execute 'cat gpperfmon/data/database_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +create external web table public._database_tail ( + like public.database_history +) execute 'cat gpperfmon/data/_database_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +create external web table public.master_data_dir (hostname text, dir text) +execute E'python -c "import socket, os; print socket.gethostname() + \\"|\\" + os.getcwd()"' on master +format 'csv' (delimiter '|'); + + +-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+-- Web API views +-- + +-- TABLE: segment_history +-- ctime record creation time +-- dbid segment database id +-- hostname hostname of system this metric belongs to +-- dynamic_memory_used bytes of dynamic memory used by the segment +-- dynamic_memory_available bytes of dynamic memory available for use by the segment +create table public.segment_history (ctime timestamptz(0) not null, dbid int not null, hostname varchar(64) not null, dynamic_memory_used bigint not null, dynamic_memory_available bigint not null) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +-- TABLE: segment_now +-- (like segment_history) +create external web table public.segment_now (like public.segment_history) execute 'cat gpperfmon/data/segment_now.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +-- TABLE: segment_tail +-- (like segment_history) +create external web table public.segment_tail (like public.segment_history) execute 'cat gpperfmon/data/segment_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: _segment_tail +-- (like segment_history) +create external web table public._segment_tail (like public.segment_history) execute 'cat gpperfmon/data/_segment_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +DROP VIEW IF EXISTS public.memory_info; +DROP VIEW IF EXISTS public.dynamic_memory_info; + +-- VIEW: dynamic_memory_info +CREATE VIEW public.dynamic_memory_info as select public.segment_history.ctime, public.segment_history.hostname, round(sum(public.segment_history.dynamic_memory_used)/1024/1024, 2) AS dynamic_memory_used_mb, round(sum(public.segment_history.dynamic_memory_available)/1024/1024, 2) AS dynamic_memory_available_mb FROM public.segment_history GROUP BY public.segment_history.ctime, public.segment_history.hostname; + +-- VIEW: memory_info +CREATE 
VIEW public.memory_info as select public.system_history.ctime, public.system_history.hostname, round(public.system_history.mem_total/1024/1024, 2) as mem_total_mb, round(public.system_history.mem_used/1024/1024, 2) as mem_used_mb, round(public.system_history.mem_actual_used/1024/1024, 2) as mem_actual_used_mb, round(public.system_history.mem_actual_free/1024/1024, 2) as mem_actual_free_mb, round(public.system_history.swap_total/1024/1024, 2) as swap_total_mb, round(public.system_history.swap_used/1024/1024, 2) as swap_used_mb, dynamic_memory_info.dynamic_memory_used_mb as dynamic_memory_used_mb, dynamic_memory_info.dynamic_memory_available_mb as dynamic_memory_available_mb FROM public.system_history, dynamic_memory_info WHERE public.system_history.hostname = dynamic_memory_info.hostname AND public.system_history.ctime = public.dynamic_memory_info.ctime; + + +-- TABLE: diskspace_history +-- ctime time of measurement +-- hostname hostname of measurement +-- filesytem name of filesystem for measurement +-- total_bytes bytes total in filesystem +-- bytes_used bytes used in the filesystem +-- bytes_available bytes available in the filesystem +create table public.diskspace_history (ctime timestamptz(0) not null, hostname varchar(64) not null, filesystem text not null, total_bytes bigint not null, bytes_used bigint not null, bytes_available bigint not null) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +--- TABLE: diskspace_now +-- (like diskspace_history) +create external web table public.diskspace_now (like public.diskspace_history) execute 'cat gpperfmon/data/diskspace_now.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: diskpace_tail +-- (like diskspace_history) +create external web table public.diskspace_tail (like public.diskspace_history) execute 'cat gpperfmon/data/diskspace_tail.dat 2> /dev/null || true' on 
master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: _diskspace_tail +-- (like diskspace_history) +create external web table public._diskspace_tail (like public.diskspace_history) execute 'cat gpperfmon/data/_diskspace_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +-- TABLE: network_interface_history ------------------------------------------------------------------------------------------------------------------------------------------------------------------- +-- ctime timestamptz(0) not null, +-- hostname varchar(64) not null, +-- interface_name varchar(64) not null, +-- bytes_received bigint, +-- packets_received bigint, +-- receive_errors bigint, +-- receive_drops bigint, +-- receive_fifo_errors bigint, +-- receive_frame_errors bigint, +-- receive_compressed_packets int, +-- receive_multicast_packets int, +-- bytes_transmitted bigint, +-- packets_transmitted bigint, +-- transmit_errors bigint, +-- transmit_drops bigint, +-- transmit_fifo_errors bigint, +-- transmit_collision_errors bigint, +-- transmit_carrier_errors bigint, +-- transmit_compressed_packets int +create table public.network_interface_history ( ctime timestamptz(0) not null, hostname varchar(64) not null, interface_name varchar(64) not null, bytes_received bigint, packets_received bigint, receive_errors bigint, receive_drops bigint, receive_fifo_errors bigint, receive_frame_errors bigint, receive_compressed_packets int, receive_multicast_packets int, bytes_transmitted bigint, packets_transmitted bigint, transmit_errors bigint, transmit_drops bigint, transmit_fifo_errors bigint, transmit_collision_errors bigint, transmit_carrier_errors bigint, transmit_compressed_packets int) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +--- TABLE: network_interface_now +-- (like network_interface_history) +create external web table 
public.network_interface_now (like public.network_interface_history) execute 'cat gpperfmon/data/network_interface_now.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: network_interface_tail +-- (like network_interface_history) +create external web table public.network_interface_tail (like public.network_interface_history) execute 'cat gpperfmon/data/network_interface_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: _network_interface_tail +-- (like network_interface_history) +create external web table public._network_interface_tail (like public.network_interface_history) execute 'cat gpperfmon/data/_network_interface_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + + +-- TABLE: sockethistory -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +-- ctime timestamptz(0) not null, +-- hostname varchar(64) not null, +-- total_sockets_used int, +-- tcp_sockets_inuse int, +-- tcp_sockets_orphan int, +-- tcp_sockets_timewait int, +-- tcp_sockets_alloc int, +-- tcp_sockets_memusage_inbytes int, +-- udp_sockets_inuse int, +-- udp_sockets_memusage_inbytes int, +-- raw_sockets_inuse int, +-- frag_sockets_inuse int, +-- frag_sockets_memusage_inbytes int + +create table public.socket_history ( ctime timestamptz(0) not null, hostname varchar(64) not null, total_sockets_used int, tcp_sockets_inuse int, tcp_sockets_orphan int, tcp_sockets_timewait int, tcp_sockets_alloc int, tcp_sockets_memusage_inbytes int, udp_sockets_inuse int, udp_sockets_memusage_inbytes int, raw_sockets_inuse int, frag_sockets_inuse int, frag_sockets_memusage_inbytes int) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +--- TABLE: socket_now +-- 
(like socket_history) +create external web table public.socket_now (like public.socket_history) execute 'cat gpperfmon/data/socket_now.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: socket_tail +-- (like socket_history) +create external web table public.socket_tail (like public.socket_history) execute 'cat gpperfmon/data/socket_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: _socket_tail +-- (like socket_history) +create external web table public._socket_tail (like public.socket_history) execute 'cat gpperfmon/data/_socket_tail.dat 2> /dev/null || true' on master format 'text' (delimiter '|' NULL as 'null'); + +-- TABLE: gp_log_master_ext +-- (like gp_toolkit.__gp_log_master_ext) +CREATE EXTERNAL WEB TABLE public.gp_log_master_ext (LIKE gp_toolkit.__gp_log_master_ext) EXECUTE E'find $GP_SEG_DATADIR/pg_log/ -name "gpdb*.csv" | sort -r | head -n 2 | xargs cat' ON MASTER FORMAT 'csv' (delimiter E',' null E'' escape E'"' quote E'"') ENCODING 'UTF8'; + +-- TABLE: log_alert_history +-- (like gp_toolkit.__gp_log_master_ext) +CREATE TABLE public.log_alert_history (LIKE gp_toolkit.__gp_log_master_ext) distributed by (logtime) partition by range (logtime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); + +-- TABLE: log_alert_tail +-- (like gp_toolkit.__gp_log_master_ext) +CREATE EXTERNAL WEB TABLE public.log_alert_tail (LIKE public.log_alert_history) EXECUTE 'cat gpperfmon/logs/alert_log_stage 2> /dev/null || true' ON MASTER FORMAT 'csv' (delimiter E',' null E'' escape E'"' quote E'"') ENCODING 'UTF8'; + +-- TABLE: log_alert_all +-- (like gp_toolkit.__gp_log_master_ext) +CREATE EXTERNAL WEB TABLE public.log_alert_now (LIKE public.log_alert_history) EXECUTE 'cat gpperfmon/logs/*.csv 2> /dev/null || true' ON MASTER FORMAT 'csv' (delimiter E',' null E'' escape E'"' quote E'"') ENCODING 'UTF8'; + +-- schema changes for gpperfmon needed to complete 
the creation of the schema + +revoke all on database gpperfmon from public; + +-- for web ui auth everyone needs connect permissions +grant connect on database gpperfmon to public; +-- END diff --git a/contrib/perfmon/gpperfmon_install b/contrib/perfmon/gpperfmon_install new file mode 100755 index 00000000000..27b8c935d93 --- /dev/null +++ b/contrib/perfmon/gpperfmon_install @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 + +''' +USAGE: gpperfmon_install --port GPDB_PORT [--enable --password GPMON_PASSWORD] [--pgpass PATH_TO_FILE] [--gpperfmonport GPPERFMON_PORT] [--verbose] + + where this script will install the gpperfmon database and schema + + --enable option will also do the following tasks as a convenience: + 1) create a gpmon super user + 2) add a line to pg_hba.conf to allow access from master host for user gpmon + 3) add a line to pg pass file + 4) set gucs to enable gpperfmon + + when using --enable, --password must be specified + + --password will set the password for gpmon superuser + --port is the port used by gpperfmon to connect to GPDB + --pgpass is an option to allow overriding default path of $HOME/.pgpass + --gpperfmonport sets the guc 'gpperfmon_port' for gpperfmon communication (default is 8888) + --verbose will show output from sub-commands +''' + +import os, sys, time, re +from subprocess import Popen + +try: + from optparse import Option, OptionParser + from gppylib.gpparseopts import OptParser, OptChecker + from gppylib.userinput import ask_input + from gppylib.gplog import get_default_logger, setup_tool_logging + from gppylib.commands.unix import getLocalHostname, getUserName +except ImportError as e: + sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. 
Detail: ' + str(e)) + +EXECNAME = os.path.split(__file__)[-1] + +class Command: + def __init__(self, cmdstr, showOutput=False): + self.cmd = cmdstr + self.verbose = showOutput + +def run_command(cmd, verbose=False): + + cmdstr = cmd.cmd + + if not options.verbose and not cmd.verbose: + cmdstr = "%s >& /dev/null" % cmdstr + # hide password + if bool(re.search('CREATE ROLE.*ENCRYPTED PASSWORD',cmdstr)): + regex = re.compile('ENCRYPTED\sPASSWORD\s\'(.*)\'') + logger.info(regex.sub('ENCRYPTED PASSWORD \'********\'',cmdstr)) + elif bool(re.search('echo.*:gpperfmon:gpmon:',cmdstr)): + regex = re.compile(':gpperfmon:gpmon:(.*)\"') + logger.info(regex.sub(':gpperfmon:gpmon:********',cmdstr)) + else: + logger.info(cmdstr) + p = Popen(cmdstr, shell=True, executable="/bin/bash") + sts = os.waitpid(p.pid, 0)[1] + if sts: + raise Exception("error on cmd " + cmdstr) + +def validate_password(input, ignore): + if len(input) < 1: + return None + else: + return input + + +def cli_help(): + help_path = os.path.join(sys.path[0], '..', 'docs', 'cli_help', EXECNAME + '_help') + f = None + try: + try: + f = open(help_path); + return f.read(-1) + except: + return '' + finally: + if f: f.close() + + +def usage(): + print (cli_help() or __doc__) + + +###### main() +if __name__ == '__main__': + + logger = get_default_logger() + setup_tool_logging(EXECNAME,getLocalHostname(),getUserName()) + + gphome = os.environ.get('GPHOME') + if not gphome: + logger.error("GPHOME not set") + sys.exit(1) + if not os.path.isfile('/bin/bash'): + logger.error("can not find /bin/bash") + sys.exit(1) + + parser = OptParser(option_class=OptChecker) + parser.remove_option('-h') + parser.add_option('-h', '-?', '--help', action='store_true') + parser.add_option('-e', '--enable', action='store_true') + parser.add_option('-v', '--verbose', action='store_true') + parser.add_option('-p', '--password', type='string') + parser.add_option('-P', '--port', type='int') + parser.add_option('-r', '--gpperfmonport', type='int', 
default=8888) + parser.add_option('--pgpass', type='string') + (options, args) = parser.parse_args() + + if options.help: + usage() + sys.exit(1) + + if not options.port: + logger.error("--port must be specified") + sys.exit(1) + + if options.enable and (not options.password): + logger.error("when enabling gpperfmon --password must be specified") + sys.exit(1) + + if not options.enable and options.password: + logger.error ("--password is only used when enabling gpperfmon") + sys.exit(1) + + commands = list() + + cmd = Command("createdb gpperfmon") + commands.append(cmd) + + cmd = Command("PGPORT=%d psql -f %s/lib/gpperfmon/gpperfmon.sql gpperfmon" % (options.port, gphome)) + commands.append(cmd) + + if options.enable: + + coordinatordata_dir = os.getenv('COORDINATOR_DATA_DIRECTORY') + pg_hba = "%s/pg_hba.conf" % coordinatordata_dir + home_dir = os.getenv('HOME') + gpperfmon_conf_dir = "%s/gpperfmon/conf" % coordinatordata_dir + gpperfmon_conf_file = "%s/gpperfmon.conf" % gpperfmon_conf_dir + gpperfmon_conf_file_src = "%s/lib/gpperfmon/gpperfmon.conf" % gphome + + if not coordinatordata_dir: + logger.error("COORDINATOR_DATA_DIRECTORY must be set") + sys.exit(1) + + if not home_dir: + logger.error("$HOME must be set") + sys.exit(1) + + if options.pgpass: + pg_pass = options.pgpass + else: + pg_pass = "%s/.pgpass" % home_dir + + old_pg_pass = "%s.%d" % (pg_pass, time.time()) + + if not os.path.isfile( pg_hba ): + logger.error("can not find pg_hba.conf at %s" % pg_hba) + sys.exit(1) + + if not os.path.isdir(home_dir): + logger.error("can not find $HOME") + sys.exit(1) + + #if os.path.isfile(gpperfmon_conf_file): + # logger.error(" gpperfmon.conf already exists %s" % gpperfmon_conf_file) + # sys.exit(1) + + if not os.path.isfile(gpperfmon_conf_file_src): + logger.error(" gpperfmon.conf doesn't exist in %s/lib/gpperfmon" % gphome) + sys.exit(1) + + cmd = Command("""PGPORT=%d psql template1 -c "DROP ROLE IF EXISTS gpmon" """ % options.port) + commands.append(cmd) + + cmd 
= Command("""PGPORT=%d psql template1 -c "CREATE ROLE gpmon WITH SUPERUSER CREATEDB LOGIN ENCRYPTED PASSWORD '%s'" """ % (options.port, options.password)) + commands.append(cmd) + + cmd = Command("""echo "local gpperfmon gpmon md5" >> %s""" % pg_hba, showOutput=True) + commands.append(cmd) + + cmd = Command("""echo "host all gpmon 127.0.0.1/28 md5" >> %s""" % pg_hba, showOutput=True) + commands.append(cmd) + + cmd = Command("""echo "host all gpmon ::1/128 md5" >> %s""" % pg_hba, showOutput=True) + commands.append(cmd) + + ################################################ + # these commands add a new line to the top of .pgpass and save a copy of old .pgpass + cmd = Command("""touch %s""" % (pg_pass)) + commands.append(cmd) + + cmd = Command("""mv -f %s %s""" % (pg_pass, old_pg_pass)) + commands.append(cmd) + + cmd = Command("""echo "*:%d:gpperfmon:gpmon:%s" >> %s""" % (options.port, options.password, pg_pass), showOutput=True) + commands.append(cmd) + + cmd = Command("""cat %s >> %s""" % (old_pg_pass, pg_pass), showOutput=True) + commands.append(cmd) + + cmd = Command("""chmod 0600 %s""" % pg_pass) + commands.append(cmd) + ################################################ + # copy the gpperfmon.conf file + + if not os.path.isdir(gpperfmon_conf_dir): + cmd = Command("""mkdir -p %s""" % gpperfmon_conf_dir, showOutput=True) + commands.append(cmd) + + cmd = Command("""cp %s %s""" % (gpperfmon_conf_file_src, gpperfmon_conf_dir), showOutput=True) + commands.append(cmd) + + ################################################ + cmd = Command("PGPORT=%d gpconfig -c perfmon.enable -v on" % (options.port)) + commands.append(cmd) + + cmd = Command("PGPORT=%d gpconfig -c perfmon.port -v %d" % (options.port, options.gpperfmonport)) + commands.append(cmd) + + cmd = Command("PGPORT=%d gpconfig -c gp_external_enable_exec -v on --masteronly" % (options.port)) + commands.append(cmd) + + #cmd = Command("PGPORT=%d gpconfig -c gpperfmon_log_alert_level -v warning" % (options.port)) + 
#commands.append(cmd) + + command = None + try: + for c in (commands): + command = c + run_command(command) + except: + logger.error("error on command: %s" % command.cmd) + logger.error("gpperfmon not successfully installed") + sys.exit(1) + + if options.enable: + logger.info("gpperfmon will be enabled after a full restart of cloudberrydb") + else: + logger.info("gpperfmon schema successfully installed") diff --git a/contrib/perfmon/gpperfmoncat.sh b/contrib/perfmon/gpperfmoncat.sh new file mode 100755 index 00000000000..4d4f9b4bf11 --- /dev/null +++ b/contrib/perfmon/gpperfmoncat.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +iconv_encodings=( + "" # SQL_ASCII not supported as server encoding. + "EUC-JP" + "EUC-CN" + "EUC-KR" + "EUC-TW" + "EUC-JISX0213" + "UTF8" + "" # MULE_INTERNAL not supported in iconv. + "LATIN1" + "LATIN2" + "LATIN3" + "LATIN4" + "LATIN5" + "LATIN6" + "LATIN7" + "LATIN8" + "LATIN9" + "LATIN10" + "WINDOWS-1256" + "WINDOWS-1258" + "" # WIN866 not supported in iconv. + "WINDOWS-874" + "KOI8-R" + "WINDOWS-1251" + "WINDOWS-1252" + "ISO_8859-5" + "ISO_8859-6" + "ISO_8859-7" + "ISO_8859-8" + "WINDOWS-1250" + "WINDOWS-1253" + "WINDOWS-1254" + "WINDOWS-1255" + "WINDOWS-1257" + "KOI8-U" + "SJIS" + "" # BIG5 not supported in server encoding. + "" # GBK not supported in server encoding. + "" # UHC not supported in server encoding. + "" # GB18030 not supported in server encoding. + "JOHAB" + "" # SJIS not supported in server encoding. 
+) +server_encoding=`(psql -p $GP_MASTER_PORT -d postgres -c "select encoding from +pg_catalog.pg_database d where d.datname = 'gpperfmon'" | tail -n3 | head -n1) 2> /dev/null || true` +iconv_encoding=${iconv_encodings[${server_encoding}]} +iconv -f $iconv_encoding -t $iconv_encoding -c $* diff --git a/contrib/perfmon/sql/guc_config.sql b/contrib/perfmon/sql/guc_config.sql new file mode 100644 index 00000000000..40d3a5bfd02 --- /dev/null +++ b/contrib/perfmon/sql/guc_config.sql @@ -0,0 +1,43 @@ +-- Disable perfmon.enable +-- start_ignore +\! gpconfig -c perfmon.enable -v false +\! gpstop -ari +-- end_ignore +\! ps -ef | grep '\[gpmmon\]' | wc -l +\c gpperfmon +show perfmon.enable; + +-- start_ignore +\! gpconfig -c perfmon.enable -v true +\! gpconfig -c perfmon.port -v 8848 +\! gpstop -ari +-- end_ignore +\! ps -ef | grep '\[gpmmon\]' | wc -l +\c gpperfmon +show perfmon.enable; +show perfmon.port; +CREATE OR REPLACE FUNCTION wait_for_gpsmon_work() RETURNS void AS $$ +DECLARE +start_time timestamptz := clock_timestamp(); +updated bool; +starttime timestamptz; +BEGIN + select COALESCE(ctime,CURRENT_TIMESTAMP) from diskspace_now into starttime; + -- we don't want to wait forever; loop will exit after 60 seconds + FOR i IN 1 .. 600 LOOP + SELECT(SELECT count(*) > 0 from diskspace_now + WHERE ctime > starttime) INTO updated; + EXIT WHEN updated; + + -- wait a little + PERFORM pg_sleep_for('100 milliseconds'); + END LOOP; + -- report time waited in postmaster log (where it won't change test output) + RAISE log 'wait_for_gpsmon_work delayed % seconds', + EXTRACT(epoch FROM clock_timestamp() - start_time); +END +$$ LANGUAGE plpgsql; +select wait_for_gpsmon_work(); +select count(*) from diskspace_now; +\! netstat -anp | grep udp | grep gpsmon | wc -l +\! 
ps -ef | grep gpsmon | grep -v grep | wc -l diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql new file mode 100644 index 00000000000..537a56fd01d --- /dev/null +++ b/contrib/perfmon/sql/query.sql @@ -0,0 +1,9 @@ +select sess_id from pg_stat_activity where pg_backend_pid()=pid; +\gset +create table test(a int); +select * from test; +select pg_sleep(18); +\c gpperfmon +select ssid, pid, ccnt, status, query_text from queries_now where ssid = :sess_id; +\c contrib_regression +drop table test; diff --git a/contrib/perfmon/src/common/gpmonlib.c b/contrib/perfmon/src/common/gpmonlib.c new file mode 100644 index 00000000000..7b23798e7af --- /dev/null +++ b/contrib/perfmon/src/common/gpmonlib.c @@ -0,0 +1,559 @@ +#undef GP_VERSION +#include "postgres_fe.h" + +#include +#include +#include +#include +#include +#include "gpmonlib.h" +#include "apr_queue.h" +#include "apr_atomic.h" +#include "apr_lib.h" +#include "assert.h" +#include "time.h" + +#if APR_IS_BIGENDIAN +#define local_htonll(n) (n) +#define local_ntohll(n) (n) +#else +#define local_htonll(n) ((((apr_uint64_t) htonl(n)) << 32LL) | htonl((n) >> 32LL)) +#define local_ntohll(n) ((((apr_uint64_t) ntohl(n)) << 32LL) | (apr_uint32_t) ntohl(((apr_uint64_t)n) >> 32LL)) +#endif + +extern apr_thread_mutex_t *logfile_mutex; + +#define LOCK_STDOUT if (logfile_mutex) { apr_thread_mutex_lock(logfile_mutex); } +#define UNLOCK_STDOUT if (logfile_mutex) { apr_thread_mutex_unlock(logfile_mutex); } +#define META_LEN 100 +#define READ_BUF_SIZE 100 + +inline void gp_smon_to_mmon_set_header(gp_smon_to_mmon_packet_t* pkt, apr_int16_t pkttype) +{ + pkt->header.pkttype = pkttype; + pkt->header.magic = GPMON_MAGIC; + pkt->header.version = GPMON_PACKET_VERSION; + return; +} + +/*Helper function to get the size of the union packet*/ +inline size_t get_size_by_pkttype_smon_to_mmon(apr_int16_t pkttype) +{ + switch (pkttype) { + case GPMON_PKTTYPE_HELLO: + return(sizeof(gpmon_hello_t)); + case GPMON_PKTTYPE_METRICS: + 
return(sizeof(gpmon_metrics_t)); + case GPMON_PKTTYPE_QLOG: + return(sizeof(gpmon_qlog_t)); + case GPMON_PKTTYPE_QEXEC: + return(sizeof(qexec_packet_t)); + case GPMON_PKTTYPE_SEGINFO: + return(sizeof(gpmon_seginfo_t)); + case GPMON_PKTTYPE_QUERY_HOST_METRICS: + return(sizeof(gpmon_qlog_t)); + case GPMON_PKTTYPE_FSINFO: + return(sizeof(gpmon_fsinfo_t)); + case GPMON_PKTTYPE_QUERYSEG: + return(sizeof(gpmon_query_seginfo_t)); + default: + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "not supported package type"); + } + + return 0; +} + +apr_status_t gpmon_ntohpkt(apr_int32_t magic, apr_int16_t version, apr_int16_t pkttype) +{ + static apr_int64_t last_err_sec = 0; + + if (magic != GPMON_MAGIC) + { + apr_int64_t now = time(NULL); + if (now - last_err_sec >= GPMON_PACKET_ERR_LOG_TIME) + { + last_err_sec = now; + gpmon_warning(FLINE, "bad packet (magic number mismatch)"); + } + return APR_EINVAL; + } + + if (version != GPMON_PACKET_VERSION) + { + apr_int64_t now = time(NULL); + if (now - last_err_sec >= GPMON_PACKET_ERR_LOG_TIME) + { + last_err_sec = now; + gpmon_warning(FLINE, "bad packet (version %d, expected %d)", version, GPMON_PACKET_VERSION); + } + return APR_EINVAL; + } + + if (! (GPMON_PKTTYPE_NONE < pkttype && pkttype < GPMON_PKTTYPE_MAX)) + { + apr_int64_t now = time(NULL); + if (now - last_err_sec >= GPMON_PACKET_ERR_LOG_TIME) + { + last_err_sec = now; + gpmon_warning(FLINE, "bad packet (unexpected packet type %d)", pkttype); + } + return APR_EINVAL; + } + + return 0; +} + + +#define GPMONLIB_DATETIME_BUFSIZE_LOCAL 100 +static char* datetime(void) +{ + static char buf[GPMONLIB_DATETIME_BUFSIZE_LOCAL]; + time_t now; + now = time(NULL); + return gpmon_datetime(now, buf); +} + + + +int gpmon_print(const char* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + + LOCK_STDOUT + + fprintf(stdout, "%s|:-LOG: ", datetime()); + vfprintf(stdout, fmt, ap); + fflush(stdout); + + UNLOCK_STDOUT + + return 0; +} + + +int gpmon_fatal(const char* fline, const char* fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + + LOCK_STDOUT + fprintf(stdout, "%s|:-FATAL: [INTERNAL ERROR %s] ", datetime(), fline); + vfprintf(stdout, fmt, ap); + fprintf(stdout, "\n ... exiting\n"); + UNLOCK_STDOUT + + fprintf(stderr, "%s|:-FATAL: [INTERNAL ERROR %s] ", datetime(), fline); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n ... exiting\n"); + + exit(1); + return 0; +} + +int gpsmon_fatal(const char* fline, const char* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + fprintf(stdout, "%s|:-FATAL: [INTERNAL ERROR %s] ", datetime(), fline); + vfprintf(stdout, fmt, ap); + fprintf(stdout, "\n ... exiting\n"); + exit(1); + return 0; +} + + + +int gpmon_fatalx(const char* fline, int e, const char* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + + LOCK_STDOUT + fprintf(stdout, "%s|:-FATAL: [INTERNAL ERROR %s] ", datetime(), fline); + vfprintf(stdout, fmt, ap); + if (e) + { + char msg[512]; + fprintf(stdout, "\n\terror %d (%s)", e, apr_strerror(e, msg, sizeof(msg))); + } + fprintf(stdout, "\n\t... exiting\n"); + UNLOCK_STDOUT + + fprintf(stderr, "%s|:-FATAL: [INTERNAL ERROR %s] ", datetime(), fline); + vfprintf(stderr, fmt, ap); + if (e) + { + char msg[512]; + fprintf(stderr, "\n\terror %d (%s)", e, apr_strerror(e, msg, sizeof(msg))); + } + fprintf(stderr, "\n\t... exiting\n"); + + exit(1); + return 0; +} + + +int gpsmon_fatalx(const char* fline, int e, const char* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + fprintf(stdout, "%s|:-FATAL: [INTERNAL ERROR %s] ", datetime(), fline); + vfprintf(stdout, fmt, ap); + if (e) + { + char msg[512]; + fprintf(stdout, "\n\terror %d (%s)", e, apr_strerror(e, msg, sizeof(msg))); + } + fprintf(stdout, "\n\t... exiting\n"); + exit(1); + return 0; +} + + + + +int gpmon_warning(const char* fline, const char* fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + + LOCK_STDOUT + fprintf(stdout, "%s|:-WARNING: [%s] ", datetime(), fline); + vfprintf(stdout, fmt, ap); + fprintf(stdout, "\n"); + UNLOCK_STDOUT + + return 0; +} + +int gpmon_warningx(const char* fline, int e, const char* fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + + LOCK_STDOUT + fprintf(stdout, "%s|:-WARNING: [%s] ", datetime(), fline); + vfprintf(stdout, fmt, ap); + if (e) { + char msg[512]; + fprintf(stdout, "\n\terror %d (%s)", e, apr_strerror(e, msg, sizeof(msg))); + } + fprintf(stdout, "\n"); + UNLOCK_STDOUT + + return 0; +} + +void gpmon_print_file(const char* header_line, FILE* fp) +{ + char buffer[READ_BUF_SIZE]; + int len; + + LOCK_STDOUT + fprintf(stdout, "%s|:-WARNING: [%s] \n", datetime(), header_line); + while ((len = fread(buffer, 1, sizeof(buffer) - 1, fp)) != 0) + { + buffer[len] = 0; + fprintf(stdout, "%s", buffer); + } + fprintf(stdout, "\n"); + + UNLOCK_STDOUT +} + +const char* gpmon_qlog_status_string(int gpmon_qlog_status) +{ + switch (gpmon_qlog_status) { + case GPMON_QLOG_STATUS_SILENT: return "silent"; + case GPMON_QLOG_STATUS_SUBMIT: return "submit"; + case GPMON_QLOG_STATUS_START: return "start"; + case GPMON_QLOG_STATUS_DONE: return "done"; + case GPMON_QLOG_STATUS_ERROR: return "abort"; + case GPMON_QLOG_STATUS_CANCELING: return "canceling"; + } + return "unknown"; +} + + +/* remove whitespaces from front & back of s */ +char* gpmon_trim(char* s) +{ + char* p = s; + char* q = s + strlen(s); + for ( ; p < q && apr_isspace(*p); p++); + for ( ; p < q && apr_isspace(q[-1]); q--); + *q = 0; + return p; +} + + +/* datetime, e.g. 
2004-02-14 23:50:02 */ +char* gpmon_datetime(time_t t, char str[GPMON_DATE_BUF_SIZE]) +{ + struct tm tm = { 0 }; + + str[0] = 0; + + if (!localtime_r(&t, &tm)) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "localtime_r failed"); + return str; + } + + snprintf(str, GPMON_DATE_BUF_SIZE, "%04d-%02d-%02d %02d:%02d:%02d", + 1900 + tm.tm_year, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + + return str; +} + +/* datetime, e.g. 2004-02-14 23:50:10 + round to lowest 5 second interval */ +char* gpmon_datetime_rounded(time_t t, char str[GPMON_DATE_BUF_SIZE]) +{ + struct tm tm = { 0 }; + + str[0] = 0; + + if (!localtime_r(&t, &tm)) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "localtime_r failed"); + return str; + } + + snprintf(str, GPMON_DATE_BUF_SIZE, "%04d-%02d-%02d %02d:%02d:%02d", + 1900 + tm.tm_year, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, ((tm.tm_sec/5)*5)); + + return str; +} + +/* get status from query text file */ +apr_int32_t get_query_status(apr_int32_t tmid, apr_int32_t ssid, + apr_int32_t ccnt) +{ + char fname[GPMON_DIR_MAX_PATH]; + FILE *fp; + apr_int32_t status = GPMON_QLOG_STATUS_INVALID; + + snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, tmid, ssid, ccnt); + + fp = fopen(fname, "r"); + if (!fp) + return GPMON_QLOG_STATUS_INVALID; + + if (0 != fseek(fp, -1, SEEK_END)) + { + fclose(fp); + return GPMON_QLOG_STATUS_INVALID; + } + fscanf(fp, "%d", &status); + fclose(fp); + return status; +} + +/* get query text from query text file */ +char *get_query_text(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_pool_t *pool) +{ + char meta[META_LEN] = {0}; + signed int qrylen = 0; + char fname[GPMON_DIR_MAX_PATH] = {0}; + const char *META_FMT = "%d qtext"; + const char *META_QTEXT = "qtext\n"; + + snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, tmid, ssid, ccnt); + + FILE *fp = fopen(fname, "r"); + if (!fp) + { + TR0(("Warning: Open file %s failed\n", fname)); + return NULL; + 
} + + // get query text length + char *readPtr = fgets(meta, META_LEN, fp); + int metaLen = strlen(meta); + if (readPtr != meta || metaLen <= strlen(META_QTEXT) || 0 != strcmp(META_QTEXT, meta + metaLen - strlen(META_QTEXT))) + { + fclose(fp); + TR0(("Warning: Invalid format in file '%s'\n", fname)); + return NULL; + } + + int count = sscanf(meta, META_FMT, &qrylen); + if (count != 1 || qrylen < 0) + { + fclose(fp); + TR0(("Warning: Invalid format in file '%s', line 1: '%s'\n", fname, meta)); + return NULL; + } + + if (qrylen > MAX_QUERY_COMPARE_LENGTH) + { + TR0(("Warning: Query length is very big %d\n", qrylen)); + qrylen = MAX_QUERY_COMPARE_LENGTH; + } + + char *query = apr_palloc(pool, qrylen + 1); + if (query == NULL) + { + fclose(fp); + TR0(("Warning: Out of memory when allocating query text\n")); + return NULL; + } + + // read query text + int readlen = fread(query, 1, qrylen, fp); + if (readlen != qrylen) + { + fclose(fp); + TR0(("Warning: Failed to read query text in file: '%s', query text length %d, read length %d.\n", fname, qrylen, readlen)); + return NULL; + } + query[readlen] = '\0'; + + fclose(fp); + + return query; +} + +int gpmon_recursive_mkdir(char* work_dir) +{ + char *pdir = work_dir; + while (*pdir) + { + if (*pdir == '/' && (pdir != work_dir)) + { + *pdir = 0; + if (-1 == mkdir(work_dir, 0700) && EEXIST != errno) + { + fprintf(stderr, "Performance Monitor - mkdir '%s' failed", work_dir); + perror("Performance Monitor -"); + return 1; + } + *pdir = '/'; + } + pdir++; + } + + if (-1 == mkdir(work_dir, 0700) && EEXIST != errno) + { + fprintf(stderr, "Performance Monitor - mkdir '%s' failed", work_dir); + perror("Performance Monitor -"); + return 1; + } + + return 0; +} + + +/* + * Create apr_pool_t with parent as well as a new allocator which belongs to + * itself so that when calling apr_pool_destroy, the free memory inside this + * pool will be returned to OS. 
(MPP-23751) + */ +apr_status_t apr_pool_create_alloc(apr_pool_t ** newpool, apr_pool_t *parent) +{ + apr_status_t rv; + apr_allocator_t *allocator; + if ((rv = apr_allocator_create(&allocator)) != APR_SUCCESS) + { + return rv; + } + if ((rv = apr_pool_create_ex(newpool, parent, NULL, allocator)) != APR_SUCCESS) + { + apr_allocator_destroy(allocator); + return rv; + } + // This function is only for internal use, so newpool can't be NULL. + apr_allocator_owner_set(allocator, *newpool); + + return APR_SUCCESS; +} + +void advance_connection_hostname(host_t* host) +{ + // for connections we should only be connecting 1 time + // if the smon fails we may have to reconnect but this event is rare + // we try 3 times on each hostname and then switch to another + host->connection_hostname.counter++; + + if (host->connection_hostname.counter > 3) + { + if (host->connection_hostname.current->next) + { + // try the next hostname + host->connection_hostname.current = host->connection_hostname.current->next; + } + else + { + // restart at the head of address list + host->connection_hostname.current = host->addressinfo_head; + } + host->connection_hostname.counter = 1; + } +} + +char* get_connection_hostname(host_t* host) +{ + return host->connection_hostname.current->address; +} + +char* get_connection_ip(host_t* host) +{ + return host->connection_hostname.current->ipstr; +} + +bool get_connection_ipv6_status(host_t* host) +{ + return host->connection_hostname.current->ipv6; +} + +double subtractTimeOfDay(struct timeval* begin, struct timeval* end) +{ + double seconds; + + if (end->tv_usec < begin->tv_usec) + { + end->tv_usec += 1000000; + end->tv_sec -= 1; + } + + seconds = end->tv_usec - begin->tv_usec; + seconds /= 1000000.0; + + seconds += (end->tv_sec - begin->tv_sec); + return seconds; +} + +/** + * Merge the qlogs with the same key + */ +void merge_qlog(gpmon_qlog_t* qlog, const gpmon_qlog_t* newqlog) +{ + Assert(qlog); + switch (newqlog->status) + { + case 
GPMON_QLOG_STATUS_SUBMIT: + TR0(("qlog status WARNNING: duplicated query log key %d-%d-%d\n", newqlog->key.tmid, newqlog->key.ssid, newqlog->key.ccnt)); + break; + case GPMON_QLOG_STATUS_START: + qlog->status = newqlog->status; + qlog->tstart = newqlog->tstart; + break; + case GPMON_QLOG_STATUS_DONE: + qlog->status = newqlog->status; + qlog->tfin = newqlog->tfin; + break; + case GPMON_QLOG_STATUS_CANCELING: + qlog->status = newqlog->status; + break; + case GPMON_QLOG_STATUS_ERROR: + qlog->status = newqlog->status; + qlog->tfin = newqlog->tfin; + break; + default: + return; + } +} diff --git a/contrib/perfmon/src/gpmmon/Makefile b/contrib/perfmon/src/gpmmon/Makefile new file mode 100644 index 00000000000..3501f77a391 --- /dev/null +++ b/contrib/perfmon/src/gpmmon/Makefile @@ -0,0 +1,18 @@ +top_builddir = ../../../../ + +MODULE_big = gpmmon +OBJS = gpmmon.o gpmondb.o gpmon_agg.o ../common/gpmonlib.o +SHLIB_LINK += -levent -lapr-1 -laprutil-1 -lm +PG_CFLAGS += -Wno-error=vla -Wno-vla +PG_CPPFLAGS = -I$(libpq_srcdir) -I../include -I/usr/include/apr-1 +SHLIB_LINK_INTERNAL = -Wl,-Bsymbolic -Wl,-Bstatic -Wl,-Bstatic $(libpq) -lpgcommon_shlib -Wl,-Bdynamic + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/perfmon/src/gpmmon +include $(top_builddir)/src/Makefile.global +include $(top_builddir)/contrib/contrib-global.mk +endif diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c new file mode 100644 index 00000000000..19c88379bc3 --- /dev/null +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -0,0 +1,1858 @@ +#include "postgres.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "apr_atomic.h" +#include "apr_env.h" +#include "apr_time.h" +#include "apr_hash.h" +#include "apr_getopt.h" +#include "apr_lib.h" +#include "apr_strings.h" +#include "apr_queue.h" +#include "apr_pools.h" 
+#include "apr_tables.h" +#include "apr_thread_proc.h" +#include "apr_thread_mutex.h" +#include "apr_md5.h" +#include "cdb/cdbvars.h" +#include "fmgr.h" +#include "gpmonlib.h" +#include "gpmon.h" +#include "gpmon_agg.h" +#include "gpmondb.h" +#include +#include "libpq-fe.h" +#include "postmaster/bgworker.h" +#include "postmaster/postmaster.h" +#include "utils/guc.h" +#include + +PG_MODULE_MAGIC; + +void _PG_init(void); +void _PG_fini(void); + +/* global guc variables */ +int perfmon_port = 8888; +bool perfmon_enabled = false; +//bool perfmon_enable_query_metric; + +void update_mmonlog_filename(void); +int gpmmon_quantum(void); +void incremement_tail_bytes(apr_uint64_t); +time_t compute_next_dump_to_file(void); +void populate_smdw_aliases(host_t*); +char* get_ip_for_host(char*, bool*); + +#define YES_TEXT "yes" +#define NO_TEXT "no" +#define GPMMON_BINARY_NAME "gpmmon" + +#define SET_MAXFD(fd) if (fd < opt.max_fd) ; else opt.max_fd = fd + +mmon_options_t opt = { 0 }; + +static const apr_uint64_t smon_terminate_safe_factor = 10; +static const apr_uint64_t recv_timeout_factor = 10; + +// If smon doesn't receive any request from mmon, +// it simply kill itself to restart. 
+static apr_uint64_t smon_terminate_timeout = 0; +static apr_uint64_t recv_timeout = 0; +int perfmon_main(Datum arg); + +/* gpmmon host */ +static struct +{ + int port; /* port of gpsmon */ + apr_pool_t* pool; /* pool */ + char* gphome; /* GPHOME env variable */ + apr_int64_t signature; /* a large random number */ + + host_t* hosttab; /* multi-home filtered machines */ + int hosttabsz; /* # machines */ + + char* master_data_directory; + char* standby_master_hostname; + apr_thread_mutex_t *agg_mutex; /* lock when accessing the agg data */ + agg_t* agg; + apr_hash_t* fsinfotab; /* This is the persistent fsinfo hash table: key = gpmon_fsinfokey_t, value = mmon_fsinfo_t ptr */ + + apr_thread_mutex_t *tailfile_mutex; /* lock when accessing the physical tail files */ + + int exit; /* TRUE if we need to exit */ + int reload; + apr_uint64_t tail_buffer_bytes; + apr_uint64_t _tail_buffer_bytes; +} ax = { 0 }; + +void interuptable_sleep(unsigned int seconds); + +// lock when accessing the debug log file +apr_thread_mutex_t *logfile_mutex = NULL; + + +/* Default option values */ +int verbose = 0; /* == opt.v */ +int very_verbose = 0; /* == opt.V */ +int quantum = 15; /* == opt.quantum */ +int min_query_time = 60; /* == opt.m */ + +/* thread handles */ +static apr_thread_t* conm_th = NULL; +static apr_thread_t* event_th = NULL; +static apr_thread_t* harvest_th = NULL; +static apr_thread_t* message_th = NULL; +apr_queue_t* message_queue = NULL; + +/* signal masks */ +sigset_t unblocksig; +sigset_t blocksig; + +extern int gpdb_exec_search_for_at_least_one_row(const char* QUERY, PGconn* persistant_conn); + +/* Function defs */ +static int read_conf_file(char *conffile); +static void gethostlist(void); +static void getconfig(void); +static apr_status_t sendpkt(int sock, const gp_smon_to_mmon_packet_t* pkt); +static apr_status_t recvpkt(int sock, gp_smon_to_mmon_packet_t* pkt, bool loop_until_all_recv); + +static void def_gucs(void); + + +#define MMON_LOG_FILENAME_SIZE 
(MAXPATHLEN+1) +char mmon_log_filename[MMON_LOG_FILENAME_SIZE]; +void update_mmonlog_filename() +{ + time_t stamp = time(NULL); + struct tm* tm = gmtime(&stamp); + snprintf( + mmon_log_filename, + MMON_LOG_FILENAME_SIZE, + "%s/gpmmon.%d.%02d.%02d_%02d%02d%02d.log", + opt.log_dir, + tm->tm_year + 1900, + tm->tm_mon + 1, + tm->tm_mday, + tm->tm_hour, + tm->tm_min, + tm->tm_sec); +} + +/** Gets quantum */ +int gpmmon_quantum(void) +{ + return opt.quantum; +} + +/* prints usage and exit */ +static void usage(const char* msg) +{ + fprintf(stderr, "\nusage: %s -D [options]\n\n", + opt.pname); + fprintf(stderr, "options:\n"); + fprintf(stderr, "\t-?\t\t: print this help screen\n"); + fprintf(stderr, "\t-V\t\t: print packet version information\n\n"); + if (msg) + fprintf(stderr, "%s\n\n", msg); + + exit(msg ? 1 : 0); +} + +void incremement_tail_bytes(apr_uint64_t bytes) +{ + ax.tail_buffer_bytes += bytes; +} + +/* Cleanup function called on exit. */ +static void cleanup() +{ + /* wait for threads, close ports, apr cleanup, etc. */ + apr_status_t tstatus; + int i; + + ax.exit = 1; + + if (event_th) + apr_thread_join(&tstatus, event_th); + if (conm_th) + apr_thread_join(&tstatus, conm_th); + if (harvest_th) + apr_thread_join(&tstatus, harvest_th); + if (message_th) + apr_thread_join(&tstatus, message_th); + + for (i = 0; i < ax.hosttabsz; i++) + { + host_t* h = &ax.hosttab[i]; + if (h) + { + apr_thread_mutex_lock(h->mutex); + if (h->sock) + close(h->sock); + h->sock = 0; + apr_thread_mutex_unlock(h->mutex); + h = NULL; + } + } + + if (ax.pool) + apr_pool_destroy(ax.pool); +} + + +/** + * Signal handlers + */ +static void SIGHUP_handler(int sig) +{ + /* Flag to reload configuration values from conf file */ + ax.reload = 1; +} + +static void SIGUSR2_handler(int sig) +{ + ax.exit = 1; +} + + +/** ------------------------------------------------------------ + After we sent a 'D'ump command, gpsmon will send us packets thru + the TCP connection. 
This function gets called whenever a packet + arrives. + */ +static void recv_from_gx(SOCKET sock, short event, void* arg) +{ + host_t* h = arg; + int e; + gp_smon_to_mmon_packet_t pktbuf; + gp_smon_to_mmon_packet_t* pkt = 0; + TR2(("recv_from_gx sock %d host %s port %d\n", sock, h->hostname, ax.port)); + + if (event & EV_TIMEOUT) + { + // no response from gpsmon for a long time + // retry connecting + TR1(("Connection to %s timeout\n",h->hostname)); + h->eflag = 1; + } + else if (event & EV_READ) + { + // reset timer of timeout event + struct timeval tv; + tv.tv_sec = 10 * gpmmon_quantum(); + tv.tv_usec = 0; + if (event_add(h->event, &tv)) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); + } + } + else + { + // bad event + return; + } + + apr_thread_mutex_lock(h->mutex); + + if (h->event) + { + if (!h->eflag) + { + e = recvpkt(sock, &pktbuf, true); + if (e == APR_FROM_OS_ERROR(EINTR)) { + TR1(("at %s: connection dropped by host %s port %d [set eflag]\n", FLINE, h->hostname, ax.port)); + h->eflag = 1; + } else if( e != 0 ) { + gpmon_warningx(FLINE, e, "cannot get packet from host %s port %d", h->hostname, ax.port); + h->eflag = 1; + } + } + if (h->eflag) + { + event_del(h->event); + h->event = 0; + } + else + { + pkt = &pktbuf; + TR2(("received packet %d from %s:%d\n", pkt->header.pkttype, h->hostname, ax.port)); + } + } + + apr_thread_mutex_unlock(h->mutex); + if (pkt) + { + apr_thread_mutex_lock(ax.agg_mutex); + e = agg_put(ax.agg, pkt); + apr_thread_mutex_unlock(ax.agg_mutex); + if (e) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "agg_put failed"); + } + } +} + +/** ------------------------------------------------------------ + This is where we setup events for TCP connections to gpsmon and + service those TCP connections, i.e. receiving qexec/qlog packets + following a dump command. We send the dump command in gpmmon_main(). 
+ + Event thread: + Forever: + serve events for 2 sec + check if we need to set up new event + + On ready: + if error: + close(event->ev_fd); + if readok: + read a packet + */ +static void* event_main(apr_thread_t* thread_, void* arg_) +{ + struct timeval tv; + host_t* tab = ax.hosttab; + const int tabsz = ax.hosttabsz; + + if (!event_init()) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_init failed"); + } + while (!ax.exit) + { + int i; + int count_valid = 0; + + /* setup new events */ + TR2(("event_main: scan hosttab\n")); + + for (i = 0; i < tabsz; i++) + { + host_t* h = &tab[i]; + SOCKET close_sock = 0; + apr_thread_mutex_lock(h->mutex); + if (h->sock) + { + if (h->eflag) + { + if (h->event) + { + event_del(h->event); + h->event = 0; + } + close_sock = h->sock; + h->sock = 0; + } + else + { + count_valid++; + if (!h->event) + { + /* set up the event */ + h->event = &h->_event; + struct timeval tv; + tv.tv_sec = recv_timeout_factor * gpmmon_quantum(); + tv.tv_usec = 0; + event_set(h->event, h->sock, EV_READ | EV_PERSIST | EV_TIMEOUT, + recv_from_gx, h); + if (event_add(h->event, &tv)) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); + } + } + } + } + apr_thread_mutex_unlock(h->mutex); + if (close_sock) + { + TR2(("closing socket %d\n", close_sock)); + closesocket(close_sock); + } + } + + if (count_valid == 0) + { + TR2(("no valid connection, sleep 1\n")); + apr_sleep(apr_time_from_sec(1)); + continue; + } + + /* serve events for 3 second */ + tv.tv_sec = 3; + tv.tv_usec = 0; + TR2(("event_loopexit\n")); + if (-1 == event_loopexit(&tv)) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_loopexit failed"); + } + TR2(("event_dispatch\n")); + if (-1 == event_dispatch() && ETIME != 
errno) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_dispatch failed"); + } + if (ax.reload == 1) + { + TR0(("sighup received, reloading conf files\n")); + + ax.reload = 0; + /* Check if perfmon is enabled. If not, exit */ + if (!perfmon_enabled) + { + TR0(("Monitoring has been disabled, exiting...\n")); + ax.exit = 1; + continue; + } + else + read_conf_file(opt.conf_file); + + TR0(("finished reloading conf files\n")); + } + } + return 0; +} + + +static apr_status_t conm_connect(SOCKET* retsock, apr_int32_t* retpid, const char* ipstr, int port, bool ipv6) +{ + struct sockaddr_in sa; + struct sockaddr_in6 sa6; + SOCKET sock = -1; + gp_smon_to_mmon_packet_t pkt; + int e = 0; + unsigned short family = 0; + struct sockaddr * sockaddrlive = NULL; + size_t length = 0; + + memset(&pkt, 0, sizeof(gp_smon_to_mmon_packet_t)); + + + if (ipv6) + { + family = AF_INET6; + memset(&sa6, 0, sizeof(sa6)); + sa6.sin6_family = AF_INET6; + sa6.sin6_port = htons(port); + inet_pton(AF_INET6, ipstr, &(sa6.sin6_addr)); + sockaddrlive = (struct sockaddr *)&sa6; + length = sizeof(sa6); + } + else + { + family = AF_INET; + memset(&sa, 0, sizeof(sa)); + sa.sin_family = AF_INET; + sa.sin_addr.s_addr = inet_addr(ipstr); + sa.sin_port = htons(port); + sockaddrlive = (struct sockaddr *)&sa; + length = sizeof(sa); + } + + if (-1 == (sock = socket(family, SOCK_STREAM, 0))) + { + gpmon_warningx(FLINE, 0, "error %d from socket system call", sock); + goto bail; + } + + SET_MAXFD(sock); + + if (-1 == connect(sock, sockaddrlive, length)) + { + e = APR_FROM_OS_ERROR(errno); + gpmon_warningx(FLINE, e, "connect system call failed"); + goto bail; + } + + struct timeval tv; + tv.tv_sec = recv_timeout; + tv.tv_usec = 0; + if (setsockopt(sock, + SOL_SOCKET, + SO_RCVTIMEO, + (char*)&tv, + sizeof(tv)) < 0) + { + e = APR_FROM_OS_ERROR(errno); + gpmon_fatalx(FLINE, e, "failed to set SO_RCVTIMEO"); + goto bail; + } + + 
gp_smon_to_mmon_set_header(&pkt, GPMON_PKTTYPE_HELLO); + pkt.u.hello.signature = ax.signature; + if (0 != (e = sendpkt(sock, &pkt))) + { + gpmon_warningx(FLINE, 0, "error %d from sendpkt system call", e); + goto bail; + } + + if (0 != (e = recvpkt(sock, &pkt, false))) { + gpmon_warningx(FLINE, 0, "error %d from recvpkt system call", e); + goto bail; + } + + if (0 != (e = gpmon_ntohpkt(pkt.header.magic, pkt.header.version, pkt.header.pkttype))) { + gpmon_warningx(FLINE, 0, "error %d from recvpkt gpmon_ntohpkt", e); + goto bail; + } + + if (pkt.header.pkttype != GPMON_PKTTYPE_HELLO) + { + gpmon_warning(FLINE, "invalid packet type"); + e = APR_EINVAL; + goto bail; + } + + /* on successful connect, save pid of gpsmon for killing it in case of hang */ + *retpid = pkt.u.hello.pid; + + *retsock = sock; + return 0; + + bail: + if (sock >= 0) closesocket(sock); + return e; +} + +/** ------------------------------------------------------------ + Connection management thread: + Forever: + sleep 5 + for any broken connection: + close socket + for any broken connection: + start gpsmon using ssh + for any broken connection: + try connect + */ +static void* conm_main(apr_thread_t* thread_, void* arg_) +{ + host_t* tab = ax.hosttab; + const int tabsz = ax.hosttabsz; + int i, j, e; + int* broken; + int count_broken = 0; + unsigned int loop; + + broken = malloc(sizeof(*broken) * tabsz); + CHECKMEM(broken); + memset(broken, 0, sizeof(*broken) * tabsz); + + for (loop = 0; !ax.exit; loop++) + { + apr_sleep(apr_time_from_sec(1)); + + if (1 == (loop % 2)) + { + // find broken connections + count_broken = 0; + for (i = 0; i < tabsz; i++) + { + host_t* h = &tab[i]; + apr_thread_mutex_lock(h->mutex); + if (h->sock == 0) + broken[count_broken++] = i; + apr_thread_mutex_unlock(h->mutex); + } + } + + if (CONM_LOOP_LAUNCH_FRAME == (loop % CONM_INTERVAL)) + { + + // for any broken connection, start gpsmon + for (i = 0; i < count_broken;) + { + FILE* fp[BATCH]; + + // use these strings to 
formulate log location for smon without allocating any dynamic memory + const char* empty_string = ""; + const char* gpperfmon_string = "/gpperfmon"; + const char* ptr_smon_log_location; + const char* ptr_smon_log_location_suffix; + + const int line_size = 2048; + char line[line_size]; + memset(fp, 0, sizeof(fp)); + for (j = 0; j < 8 && i < count_broken; j++, i++) + { + host_t* h = &tab[broken[i]]; + char* active_hostname; + + advance_connection_hostname(h); // if we have to connect many times to same host try a new hostname for same host + + active_hostname = get_connection_hostname(h); + + // smon will log to gpperfmon directory on master, specified location on smon, or one of the gpperfmon subdir of one of the data directories as default + if (h->is_master) + { + ptr_smon_log_location = opt.log_dir; + ptr_smon_log_location_suffix = empty_string; + } + else if (opt.smon_log_dir) + { + ptr_smon_log_location = opt.smon_log_dir; + ptr_smon_log_location_suffix = empty_string; + } + else + { + ptr_smon_log_location = h->data_dir; + ptr_smon_log_location_suffix = gpperfmon_string; + } + + const int kill_cmd_size = 1024; + char kill_gpsmon[kill_cmd_size]; + memset(kill_gpsmon, 0, kill_cmd_size); + if (h->connect_timeout == GPSMON_TIMEOUT_RESTART && h->pid > 0) + { + snprintf(kill_gpsmon, kill_cmd_size, "kill -9 %d;", h->pid); + apr_thread_mutex_lock(h->mutex); + h->pid = 0; /* don't try to kill gpsmon repeatedly */ + h->connect_timeout = GPSMON_TIMEOUT_NONE; /* try reconnect immediately */ + apr_thread_mutex_unlock(h->mutex); + } + + if (h->smon_bin_location) { //if this is filled, then use it as the directory for smon instead of the default + snprintf(line, line_size, "ssh -v -o 'BatchMode yes' -o 'StrictHostKeyChecking no'" + " %s '%s echo -e \"%" APR_INT64_T_FMT "\\n\\n\" | %s -m %" FMT64 " -t %" FMT64 " -l %s%s -v %d %d' 2>&1", + active_hostname, kill_gpsmon, ax.signature, h->smon_bin_location, opt.max_log_size, smon_terminate_timeout, ptr_smon_log_location,
ptr_smon_log_location_suffix, opt.v, ax.port); + } else { + snprintf(line, line_size, "ssh -v -o 'BatchMode yes' -o 'StrictHostKeyChecking no'" + " %s '%s echo -e \"%" APR_INT64_T_FMT "\\n\\n\" | %s/bin/gpsmon -m %" FMT64 " -t %" FMT64 " -l %s%s -v %d %d' 2>&1", + active_hostname, kill_gpsmon, ax.signature, ax.gphome, opt.max_log_size, smon_terminate_timeout, ptr_smon_log_location, ptr_smon_log_location_suffix, opt.v, ax.port); + + } + + if (h->ever_connected) + { + TR0(("Connection to %s lost. Restarting gpsmon.\n", active_hostname)); + } + else + { + TR0(("Making initial connection to %s\n", active_hostname)); + } + h->ever_connected = 1; + + TR1(("%s\n", line)); + fp[j] = popen(line, "r"); + if (fp[j]) + { + SET_MAXFD(fileno(fp[j])); + } + else + { + TR0(("Call popen failed due to %s\n", strerror(errno))); + } + } + for (j = 0; j < 8; j++) + { + if (fp[j]) + { + TR1_FILE(("Debugging logs of ssh", fp[j])); + pclose(fp[j]); + } + } + } + } + + // for any broken/timeout connection, try connect + bool try_connect_normal = (CONM_LOOP_BROKEN_FRAME == (loop % CONM_INTERVAL)); + bool try_connect_hang = (CONM_LOOP_HANG_FRAME == (loop % CONM_INTERVAL)); + if (try_connect_normal || try_connect_hang) + { + for (i = 0; i < count_broken; i++) + { + host_t* h = &tab[broken[i]]; + if(GPSMON_TIMEOUT_DETECTED == h->connect_timeout) + { + /* In next loop will begin to restart and reconnect gpsmon */ + apr_thread_mutex_lock(h->mutex); + h->connect_timeout = GPSMON_TIMEOUT_RESTART; + apr_thread_mutex_unlock(h->mutex); + continue; + } + if(GPSMON_TIMEOUT_NONE == h->connect_timeout && !try_connect_normal) + { + continue; + } + if(GPSMON_TIMEOUT_RESTART == h->connect_timeout && !try_connect_hang) + { + continue; + } + SOCKET sock = 0; + apr_int32_t gpsmon_pid = 0; + char* active_hostname = get_connection_hostname(h); + char* active_ip = get_connection_ip(h); + bool ipv6 = get_connection_ipv6_status(h); + + TR1(("connecting to %s (%s:%d)\n", active_hostname, active_ip, ax.port)); + if 
(0 != (e = conm_connect(&sock, &gpsmon_pid, active_ip, ax.port, ipv6))) + { + gpmon_warningx(FLINE, 0, "cannot connect to %s (%s:%d)", + active_hostname, active_ip, ax.port); + if (APR_ETIMEDOUT == e) /* connection timeout */ + { + if (GPSMON_TIMEOUT_RESTART == h->connect_timeout) + { + gpmon_warning(FLINE, "Failed to reconnect gpsmon on %s, maybe network isolation or other process occupied the port", active_hostname); + } + else if (GPSMON_TIMEOUT_NONE == h->connect_timeout) + { + /* Mark the host as timeout, push it behind normal host */ + apr_thread_mutex_lock(h->mutex); + h->connect_timeout = GPSMON_TIMEOUT_DETECTED; + apr_thread_mutex_unlock(h->mutex); + } + } + else + { + apr_thread_mutex_lock(h->mutex); + h->connect_timeout = GPSMON_TIMEOUT_NONE; + apr_thread_mutex_unlock(h->mutex); + } + continue; + } + /* connected - set it to valid */ + TR1(("connected to %s (%s:%d), pid %d\n", active_hostname, active_ip, ax.port, gpsmon_pid)); + apr_thread_mutex_lock(h->mutex); + h->sock = sock; + h->event = 0; + h->eflag = 0; + h->connect_timeout = GPSMON_TIMEOUT_NONE; + h->pid = gpsmon_pid; + apr_thread_mutex_unlock(h->mutex); + } + } + } + return 0; +} + +/* seperate thread for harvest */ +static void* harvest_main(apr_thread_t* thread_, void* arg_) +{ + unsigned int loop; + apr_status_t status; + unsigned int consecutive_failures = 0; + unsigned int partition_check_interval = 3600 * 6; // check for new partitions every 6 hours + + gpdb_check_partitions(&opt); + + for (loop = 1; !ax.exit; loop++) + { + apr_sleep(apr_time_from_sec(1)); + + if (0 == (loop % opt.harvest_interval)) + { + int e; + /* + PROCESS: + 1) WITH TAIL MUTEX: rename tail files to stage files + 2) WITH TAIL MUTEX: create new tail files + 3) Append data from stage files into _tail files + 4) load data from _tail files into system history + 5) delete _tail files after successful data load + + NOTES: + 1) mutex is held only over rename and creation of new tail files + The reason is this is a fast 
operation and both the main thread doing dumps and the harvest + thread uses the tail files + + 2) tail files are renamed/moved because this operation is safe to be done while the clients are + reading the files. + + 3) The stage files are written over every harvest cycle, so the idea is no client + will still be reading the tail files for an entire harvest cycle. (this is not perfect logic but ok) + */ + + apr_pool_t* pool; /* create this pool so we can destroy it each loop */ + + + if (0 != (e = apr_pool_create_alloc(&pool, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + return (void*)1; + } + + /* LOCK TAIL MUTEX ****************************************/ + apr_thread_mutex_lock(ax.tailfile_mutex); + + ax._tail_buffer_bytes += ax.tail_buffer_bytes; + ax.tail_buffer_bytes = 0; + + status = gpdb_rename_tail_files(pool); + + status = gpdb_truncate_tail_files(pool); + + apr_thread_mutex_unlock(ax.tailfile_mutex); + /* UNLOCK TAIL MUTEX ****************************************/ + + status = gpdb_copy_stage_to_harvest_files(pool); + + if (status == APR_SUCCESS) + { + status = gpdb_harvest(); + } + + if (status != APR_SUCCESS) + { + gpmon_warningx(FLINE, 0, "harvest failure: accumulated tail file size is %lu bytes", ax._tail_buffer_bytes); + consecutive_failures++; + } + + if (status == APR_SUCCESS || + consecutive_failures > 100 || + (ax._tail_buffer_bytes > opt.tail_buffer_max)) + { + /* + delete the data in the _tail file because it has been inserted successfully into history + we also delete the data without loading if it loading failed more than 100 times as a defensive measure for corrupted data in the file + better to lose some perf data and self fix than calling support to have them manually find the corrupted perf data and clear it themselves + we also delete the data if the size of the data is greater than our max data size + */ + status = 
gpdb_empty_harvest_files(pool); + + if (status != APR_SUCCESS) + { + gpmon_warningx(FLINE, 0, "error trying to clear harvest files"); + } + consecutive_failures = 0; + ax._tail_buffer_bytes = 0; + } + //gpdb_import_alert_log(pool); + apr_pool_destroy(pool); /*destroy the pool since we are done with it*/ + } + if (0 == (loop % partition_check_interval)) + { + gpdb_check_partitions(&opt); + } + } + + return APR_SUCCESS; +} + + +/* Separate thread for message sending */ +/* As gp_elog has been removed, disable this function */ +/* +static void* message_main(apr_thread_t* thread_, void* arg_) +{ + apr_queue_t *queue = arg_; + void *query = NULL; + apr_status_t status; + + TR2(("In message_main: error_disk_space_percentage = %d, warning_disk_space_percentage = %d, disk_space_interval = %d, max_disk_space_messages_per_interval = %d\n", + opt.error_disk_space_percentage, opt.warning_disk_space_percentage, (int) opt.disk_space_interval, opt.max_disk_space_messages_per_interval)); + while (1) + { + query = NULL; + status = apr_queue_pop(queue, &query); + if (status == APR_EINTR) + { //the blocking operation was interrupted (try again) + continue; + } + else if (status != APR_SUCCESS) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx( + FLINE, status, "message_main ERROR: apr_queue_pop failed: returned %d", status); + return (void*)1; + } + else if (NULL == query) + { + TR0(("message_main ERROR: apr_queue_pop returned NULL\n")); + } + else + { // send the message + if (!gpdb_exec_search_for_at_least_one_row((const char *)query, NULL)) + { + TR0(("message_main ERROR: query %s failed. 
Cannot send message\n", (char *) query)); + } + free(query); + } + + } + return APR_SUCCESS; +} +*/ + +time_t compute_next_dump_to_file() +{ + time_t current_time = time(NULL); + return (current_time - (current_time % opt.quantum) + opt.quantum); +} + +static void gpmmon_main(void) +{ + int e; + apr_status_t retCode; + apr_threadattr_t* ta; + time_t this_cycle_ts = 0; + /* log check is not exact. do it every X loops */ + int ticks_since_last_log_check = 0; + const unsigned int log_check_interval = 60; + + const int safety_ticks = 2 * opt.quantum; + unsigned int dump_request_time_allowance = opt.quantum / 2; + + /* DUMP TO FILE */ + time_t next_dump_to_file_ts; + int dump_to_file_safety_ticks = safety_ticks; + + /* SEND MESSAGE */ + time_t next_send_msg_ts; + int send_msg_safety_ticks = safety_ticks; + + /* init timestamps */ + next_dump_to_file_ts = compute_next_dump_to_file(); + next_send_msg_ts = next_dump_to_file_ts - dump_request_time_allowance; + + /* TODO: MPP-3974 might have actually been caused by the spin lock problem... investigate */ + setenv("EVENT_NOKQUEUE", "1", 1); + setenv("EVENT_NOEVPORT", "1", 1); + setenv("EVENT_SHOW_METHOD", "1", 1); + /* MPP-3974. 
Hawks systems don't like devpoll */ + setenv("EVENT_NODEVPOLL", "1", 1); + + if (0 != (e = apr_pool_create_alloc(&ax.pool, 0))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + } + + if (0 != (e = apr_env_get(&ax.gphome, "GPHOME", ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "GPHOME environment variable not set"); + } + + /* Create mutexes */ + if (0 != (e = apr_thread_mutex_create(&ax.agg_mutex, APR_THREAD_MUTEX_UNNESTED, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "Resource Error: Failed to create agg_mutex"); + } + + if (0 != (e = apr_thread_mutex_create(&ax.tailfile_mutex, APR_THREAD_MUTEX_UNNESTED, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "Resource Error: Failed to create tailfile_mutex"); + } + + if (0 != (e = apr_thread_mutex_create(&logfile_mutex, APR_THREAD_MUTEX_UNNESTED, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "Resource Error: Failed to create logfile_mutex"); + } + + if (0 != (e = apr_threadattr_create(&ta, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_threadattr_create failed"); + } + + if (0 != (e = apr_threadattr_detach_set(ta, 1))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_threadattr_detach_set failed"); + } + + // generate signature + // this used to use apr_generate_random_bytes but that hangs on entropy in the system being available + // this is not used for security or protecting against attacks, so a simpler random number will do + srand(time(NULL)); + ax.signature = rand(); + 
ax.signature <<= 32; + ax.signature += rand(); + if (ax.signature < 0) + ax.signature = ~ax.signature; + + /* make sure to update the partition tables once before starting all the threads */ + retCode = gpdb_check_partitions(&opt); + if (retCode != APR_SUCCESS) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "failed while initializing historical tables with current month partitions"); + } + + /* get hostlist */ + gethostlist(); + + /* create the persistent fsinfo hash table */ + ax.fsinfotab = apr_hash_make(ax.pool); + if (!ax.fsinfotab) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_hash_make for fsinfo hash table failed"); + } + + /* create the agg */ + if (0 != (e = agg_create(&ax.agg, 1, ax.pool, ax.fsinfotab))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "agg_create failed"); + } + + /* spawn conm thread */ + if (0 != (e = apr_thread_create(&conm_th, ta, conm_main, 0, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_thread_create failed"); + } + + /* spawn event thread */ + if (0 != (e = apr_thread_create(&event_th, ta, event_main, 0, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_thread_create failed"); + } + + /* spawn harvest thread */ + if (0 != (e = apr_thread_create(&harvest_th, ta, harvest_main, 0, ax.pool))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, "apr_thread_create failed"); + } + + /* gp_elog has been removed in hashdata-lightning */ + ///* Create message queue */ + //if (0 != (e = apr_queue_create(&message_queue, MAX_MESSAGES_PER_INTERVAL, ax.pool))) + //{ + // interuptable_sleep(30); // sleep to prevent loop of forking process 
and failing + // gpmon_fatalx(FLINE, e, "apr_queue_create failed"); + //} + + ///* spawn disk space message thread */ + //if (0 != (e = apr_thread_create(&message_th, ta, message_main, message_queue, ax.pool))) + //{ + // interuptable_sleep(30); // sleep to prevent loop of forking process and failing + // gpmon_fatalx(FLINE, e, "apr_thread_create failed"); + //} + + /* main loop */ + while (!ax.exit) + { + apr_sleep(apr_time_from_sec(1)); + + this_cycle_ts = time(NULL); + send_msg_safety_ticks--; + dump_to_file_safety_ticks--; + ticks_since_last_log_check++; + + /* SEND MESSAGE */ + if ((this_cycle_ts >= next_send_msg_ts) || (send_msg_safety_ticks < 1)) + { + int i; + for (i = 0; i < ax.hosttabsz; i++) + { + host_t* h = &ax.hosttab[i]; + apr_thread_mutex_lock(h->mutex); + /* only send to entries with a socket, handling events, and no error */ + TR1(("send dump %d eflag %d\n", h->sock, h->eflag)); + if (h->sock && h->event && !h->eflag) + { + if (1 != send(h->sock, "D", 1, 0)) + { + h->eflag = 1; + TR1(("at %s: cannot send 'D'ump command [set eflag]\n", FLINE)); + } + } + apr_thread_mutex_unlock(h->mutex); + } + + send_msg_safety_ticks = safety_ticks; + next_send_msg_ts = this_cycle_ts + opt.quantum; + } + + /* DUMP TO FILE */ + if ((this_cycle_ts >= next_dump_to_file_ts) || (dump_to_file_safety_ticks < 1)) + { + agg_t* newagg = 0; + agg_t* oldagg = 0; + + /* mutex lock the aggregate data while we dump and dupe it */ + apr_thread_mutex_lock(ax.agg_mutex); + + /* mutex tail files during dump call */ + apr_thread_mutex_lock(ax.tailfile_mutex); + + /* dump the current aggregates */ + if (0 != (e = agg_dump(ax.agg))) + { + gpmon_warningx(FLINE, e, "unable to finish aggregation"); + } + + apr_thread_mutex_unlock(ax.tailfile_mutex); + + /* make a new one, copy only recently updated entries */ + if (0 != (e = agg_dup(&newagg, ax.agg, ax.pool, ax.fsinfotab))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, e, 
"agg_dup failed"); + } + oldagg = ax.agg; + ax.agg = newagg; + + apr_thread_mutex_unlock(ax.agg_mutex); + /* destroy the old agg */ + agg_destroy(oldagg); + + next_dump_to_file_ts = compute_next_dump_to_file(); + next_send_msg_ts = next_dump_to_file_ts - dump_request_time_allowance; + dump_to_file_safety_ticks = safety_ticks; + } + + if (!opt.console && (ticks_since_last_log_check > log_check_interval)) + { + apr_finfo_t finfo; + //it is ok to use the parent pool here b/c it is not for used for allocation in apr_stat + if (0 == apr_stat(&finfo, mmon_log_filename, APR_FINFO_SIZE, ax.pool)) + { + if (opt.max_log_size != 0 && finfo.size > opt.max_log_size) + { + update_mmonlog_filename(); + apr_thread_mutex_lock(logfile_mutex); + freopen(mmon_log_filename, "w", stdout); + apr_thread_mutex_unlock(logfile_mutex); + } + } + ticks_since_last_log_check = 0; + } + } +} + +static void print_version(void) +{ + fprintf(stdout, GPMMON_PACKET_VERSION_STRING); +} + +static int read_conf_file(char *conffile) +{ + char buffer[1024] = { 0 }; + char *p = NULL; + FILE *fp = fopen(conffile, "r"); + int section = 0, section_found = 0; + + opt.quantum = quantum; + opt.min_query_time = min_query_time; + opt.harvest_interval = 120; + opt.max_log_size = 0; + opt.log_dir = strdup(DEFAULT_GPMMON_LOGDIR); + opt.max_disk_space_messages_per_interval = MAX_MESSAGES_PER_INTERVAL; + opt.disk_space_interval = (60*MINIMUM_MESSAGE_INTERVAL); + opt.partition_age = 0; + + if (!fp) + { + fprintf(stderr, "Performance Monitor - Error: Failed to open configuration file. Using defaults."); + return 0; + } + + while (NULL != fgets(buffer, 1024, fp)) + { + /* remove new line */ + p = gpmon_trim(buffer); + + if (p[0] == '[') /* Start of section */ + { + if (apr_strnatcasecmp(p, "[gpmmon]") == 0) + section = section_found = 1; + else + section = 0; + } + else /* config param */ + { + char *pName = NULL, *pVal = NULL, *pTemp = NULL; + /* is it a comment? 
*/ + pTemp = p; + while (pTemp && *pTemp) + { + if (*pTemp == '#') + { + *pTemp = '\0'; + break; + } + pTemp++; + } + + pName = strtok(buffer, "="); + pVal = strtok(NULL, "="); + + if (section == 0 || buffer[0] == 0 || pName == NULL || pVal == NULL) + continue; + + pName = gpmon_trim(pName); + pVal = gpmon_trim(pVal); + + if (apr_strnatcasecmp(pName, "quantum") == 0) + { + opt.quantum = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "harvest_interval") == 0) + { + opt.harvest_interval = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "min_query_time") == 0) + { + opt.min_query_time = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "verbose") == 0) + { + opt.v = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "qamode") == 0) + { + /* this will allow QA to make config settings that are normally illegal */ + opt.qamode = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "console") == 0) + { + /* this will disable logging to log files */ + opt.console = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "log_location") == 0) + { + /* can't use APR here as the pool is just temporary */ + if (opt.log_dir) + free(opt.log_dir); + + opt.log_dir = strdup(pVal); + } + else if (apr_strnatcasecmp(pName, "smon_log_location") == 0) + { + if (opt.smon_log_dir) + free(opt.smon_log_dir); + + opt.smon_log_dir = strdup(pVal); + } + else if (apr_strnatcasecmp(pName, "hadoop_hostfile") == 0) + { + if (opt.smon_hadoop_swonly_clusterfile) + free(opt.smon_hadoop_swonly_clusterfile); + opt.smon_hadoop_swonly_clusterfile = strdup(pVal); + } + else if (apr_strnatcasecmp(pName, "hadoop_logdir") == 0) + { + if (opt.smon_hadoop_swonly_logdir) + free(opt.smon_hadoop_swonly_logdir); + opt.smon_hadoop_swonly_logdir = strdup(pVal); + } + else if (apr_strnatcasecmp(pName, "hadoop_smon_path") == 0) + { + if (opt.smon_hadoop_swonly_binfile) + free(opt.smon_hadoop_swonly_binfile); + opt.smon_hadoop_swonly_binfile = strdup(pVal); + } + else if (apr_strnatcasecmp(pName, "smdw_aliases") 
== 0) + { + opt.smdw_aliases = strdup(pVal); + } + else if (apr_strnatcasecmp(pName, "tail_buffer_max") == 0) + { + opt.tail_buffer_max = apr_atoi64(pVal); + } + else if (apr_strnatcasecmp(pName, "max_log_size") == 0) + { + opt.max_log_size = apr_atoi64(pVal); + } + else if (apr_strnatcasecmp(pName, "warning_disk_space_percentage") == 0) + { + opt.warning_disk_space_percentage = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "error_disk_space_percentage") == 0) + { + opt.error_disk_space_percentage = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "disk_space_interval") == 0) + { + opt.disk_space_interval = (time_t) (atoi(pVal)*60); //interval in seconds but set in minutes, so multiply + } + else if (apr_strnatcasecmp(pName, "max_disk_space_messages_per_interval") == 0) + { + opt.max_disk_space_messages_per_interval = atoi(pVal); + } + else if (apr_strnatcasecmp(pName, "partition_age") == 0) + { + opt.partition_age = atoi(pVal); + } + else + { + fprintf(stderr, "Unknown option %s\n", pName == NULL ? "(NULL)" + : pName); + } + } + } + + /* check for valid entries */ + if (!section_found) + fprintf(stderr, "Performance Monitor - Failed to find [gpmmon] section in the " + "configuration file. Using default values\n"); + + if (opt.quantum !=5 && opt.quantum != 10 && opt.quantum != 15 && opt.quantum != 20 && opt.quantum != 30 && opt.quantum != 60) + { + fprintf(stderr, "Performance Monitor - quantum value must be either 10, 15, 20, 30 or 60. Using " + "default value of 15\n"); + opt.quantum = 15; + } + + if (opt.min_query_time < 0) + opt.min_query_time = 0; + + if (opt.log_dir == NULL) + { + char log_dir[MAXPATHLEN + 1] = { 0 }; + snprintf(log_dir, MAXPATHLEN, "%s/%s", ax.master_data_directory, + "gpperfmon/logs/"); + opt.log_dir = strdup(log_dir); + } + + if (opt.harvest_interval < 30 && !opt.qamode) + { + fprintf(stderr, "Performance Monitor - harvest_interval must be greater than 30.
Using " + "default value 120\n"); + opt.harvest_interval = 120; + } + + if (opt.warning_disk_space_percentage < 0 || opt.warning_disk_space_percentage >= 100) + { + fprintf(stderr, "Performance Monitor - warning_disk_space_percentage must be between 1 and 100. Disabling.\n"); + opt.warning_disk_space_percentage = 0; + } + + + if (opt.error_disk_space_percentage < 0 || opt.error_disk_space_percentage >= 100) + { + fprintf(stderr, "Performance Monitor - error_disk_space_percentage must be between 1 and 100. Disabling.\n"); + opt.error_disk_space_percentage = 0; + } + + if (opt.error_disk_space_percentage < opt.warning_disk_space_percentage) { + fprintf(stderr, "Performance Monitor - error_disk_space_percentage less than warning_disk_space_percentage, so setting to warning_disk_space_percentage.\n"); + opt.error_disk_space_percentage = opt.warning_disk_space_percentage; + } + + if (opt.max_disk_space_messages_per_interval > MAX_MESSAGES_PER_INTERVAL) { + fprintf(stderr, "Performance Monitor - max_disk_space_messages_per_interval must not be greater than %d. Setting to %d.\n",MAX_MESSAGES_PER_INTERVAL, MAX_MESSAGES_PER_INTERVAL ); + opt.max_disk_space_messages_per_interval = MAX_MESSAGES_PER_INTERVAL; + } else if (opt.max_disk_space_messages_per_interval < MIN_MESSAGES_PER_INTERVAL) { + fprintf(stderr, "Performance Monitor - max_disk_space_messages_per_interval must not be less than %d. Setting to %d.\n",MIN_MESSAGES_PER_INTERVAL, MIN_MESSAGES_PER_INTERVAL ); + opt.max_disk_space_messages_per_interval = MIN_MESSAGES_PER_INTERVAL; + } + + if (opt.disk_space_interval < (60 *MINIMUM_MESSAGE_INTERVAL) ) { + fprintf(stderr, "Performance Monitor - disk_space_interval must not be less than %d minute.
Setting to %d minute.\n",MINIMUM_MESSAGE_INTERVAL, MINIMUM_MESSAGE_INTERVAL ); + opt.disk_space_interval = (60 *MINIMUM_MESSAGE_INTERVAL); + } else if (opt.disk_space_interval > (60 *MAXIMUM_MESSAGE_INTERVAL) ) { + fprintf(stderr, "Performance Monitor - disk_space_interval must not be greater than %d minutes. Setting to %d minutes.\n",MAXIMUM_MESSAGE_INTERVAL, MAXIMUM_MESSAGE_INTERVAL ); + opt.disk_space_interval = (60 *MAXIMUM_MESSAGE_INTERVAL); + } + + + + if (opt.tail_buffer_max == 0) + { + opt.tail_buffer_max = (1LL << 31); /* 2GB */ + } + + smon_terminate_timeout = opt.quantum * smon_terminate_safe_factor; + recv_timeout = opt.quantum * recv_timeout_factor; + verbose = opt.v; + min_query_time = opt.min_query_time; + quantum = opt.quantum; + + fclose(fp); + return 0; +} + +void interuptable_sleep(unsigned int seconds) +{ + int i; + for (i = 0; i < seconds && !ax.exit; i++) + apr_sleep(apr_time_from_sec(1)); + + if (ax.exit) + exit(0); +} + +int perfmon_main(Datum arg) +{ + int db_check_count = 0; + char port[6]; + + + if (apr_initialize()) + { + fprintf(stderr, "Performance Monitor - Internal error, failed to initialize APR.\n"); + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + exit(1); + } + + /* init conf_file and gpdb_port */ + opt.conf_file = (char *)malloc(sizeof(char) * (MAXPGPATH + 1)); + memset(opt.conf_file, 0 , sizeof(char) * (MAXPGPATH + 1)); + snprintf(opt.conf_file, MAXPGPATH, "%s/gpperfmon/conf/gpperfmon.conf", + DataDir); + + memset(port, 0, sizeof(port)); + snprintf(port, 5, "%d", PostPortNumber); + opt.gpdb_port = port; + + /* Set env if we got a port. This will be picked up by libpq */ + if (opt.gpdb_port) + setenv("PGPORT", opt.gpdb_port, 1); + + /* set bgworker signal */ + pqsignal(SIGHUP, SIGHUP_handler); + pqsignal(SIGUSR2, SIGUSR2_handler); + /* we're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + /* Check for gpperfmon database. 
If it doesn't exists, + * hang around until it does or we get a stop request */ + for (;;) + { + int gpperfmon_valid = 0; + + if (ax.exit) + exit(0); + + gpperfmon_valid = gpdb_validate_gpperfmon(); + if (!gpperfmon_valid) + { + /* Don't want to fill up the log with these messages, + * so only log it once every 5 minutes */ + if (db_check_count % 5 == 0) + fprintf(stderr, "Performance Monitor - There was a problem " + "accessing the gpperfmon database."); + + db_check_count += 1; + + interuptable_sleep(60); // sleep to prevent loop of forking process and failing + } + else + break; + } + + getconfig(); + read_conf_file(opt.conf_file); + + gpmon_warning(FLINE, "read config file"); + /* redirect output to log_file */ + /* stdout goes to log: debug and warnings */ + /* stderr goes to pg_log: fatal errors */ + { + if (gpmon_recursive_mkdir(opt.log_dir)) + { + fprintf(stderr, "\nPerformance Monitor -- cannot create directory %s", opt.log_dir); + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "cannot create directory %s", opt.log_dir); + } + + update_mmonlog_filename(); + if (!opt.console && !freopen(mmon_log_filename, "w", stdout)) + { + fprintf(stderr, "\nPerformance Monitor -- failed to open perfmon log file %s\n", mmon_log_filename); + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatal(FLINE, "\nfailed (1) to open perfmon log file %s\n", mmon_log_filename); + } + TR0(("starting mmon logging\n")); + } + /* check port */ + if (!(0 < ax.port && ax.port < (1 << 16))) + { + usage("Error: invalid port number"); + } + + /* check that we are indeed running in a postgres data directory */ + { + FILE* fp = fopen("pg_hba.conf", "r"); + if (!fp) + { + usage("Error: master data directory is not valid; can't find pg_hba.conf"); + } + SET_MAXFD(fileno(fp)); + fclose(fp); + } + + /* start up gpperfmon directory */ + { + char work_dir[MAXPATHLEN+1] = {0}; + 
strncpy(work_dir, GPMON_DIR, MAXPATHLEN); + work_dir[MAXPATHLEN] = 0; + + if (gpmon_recursive_mkdir(work_dir)) + { + fprintf(stderr, "\ncannot create directory %s/%s", ax.master_data_directory, GPMON_DIR); + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "cannot create directory %s/%s", ax.master_data_directory, GPMON_DIR); + } + } + + create_log_alert_table(); + gpmmon_main(); + + cleanup(); + + return 0; +} + +void populate_smdw_aliases(host_t* host) +{ + char* saveptr; + char* token; + + if (!opt.smdw_aliases) + { + return; + } + + token = strtok_r(opt.smdw_aliases, ",", &saveptr); + while (token) + { + if (!host->addressinfo_tail) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, 0, "smdw addressname structure is inconsistent"); + } + + // permenant memory for address list -- stored for duration + host->addressinfo_tail->next = calloc(1, sizeof(addressinfo_holder_t)); + CHECKMEM(host->addressinfo_tail); + + host->addressinfo_tail = host->addressinfo_tail->next; + + host->addressinfo_tail->address = strdup(token); + CHECKMEM(host->addressinfo_tail->address); + + host->address_count++; + + token = strtok_r(NULL, ",", &saveptr); + } +} + +// returnParamIsIpv6 will be set to true for IPv6 addresses +// the actual IP address string is returned from the function +char* get_ip_for_host(char* host, bool* returnParamIsIpv6) +{ + char * ipstr; + int ret; + struct addrinfo *addrs = NULL; + struct addrinfo hint; + + /* Initialize hint structure */ + memset(&hint, 0, sizeof(hint)); + hint.ai_socktype = SOCK_STREAM; /* TCP */ + hint.ai_family = AF_UNSPEC; /* Allow for any family */ + + ret = getaddrinfo(host, NULL, &hint, &addrs); + if (ret || !addrs) + { + freeaddrinfo(addrs); + + gpmon_fatalx(FLINE, 0, "getaddrinfo returned %s", gai_strerror(ret)); + return NULL; + } + + ipstr = malloc(128); + /* just grab the first address... 
it should be fine */ + if (addrs->ai_family == AF_INET) + { + struct sockaddr_in* sock = (struct sockaddr_in*)addrs->ai_addr; + inet_ntop(addrs->ai_family, &sock->sin_addr, ipstr, 128); + *returnParamIsIpv6 = false; + } + else if (addrs->ai_family == AF_INET6) + { + struct sockaddr_in6* sock = (struct sockaddr_in6*)addrs->ai_addr; + inet_ntop(addrs->ai_family, &sock->sin6_addr, ipstr, 128); + *returnParamIsIpv6 = true; + } + else + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, 0, "Bad address family for host: %s %d", host, addrs->ai_family); + } + if (addrs) + freeaddrinfo(addrs); + + return ipstr; +} + +#define GPMMON_GETHOSTLIST_LINE_BUFSIZE 1024 +static void gethostlist() +{ + int i = 0; + + /* Connect to database, get segment hosts from gp_segment_configuration */ + gpdb_get_hostlist(&ax.hosttabsz, &ax.hosttab, ax.pool, &opt); + + for (i = 0; i < ax.hosttabsz; ++i) + { + addressinfo_holder_t* addressinfo; + + // there are potentially more hostnames for standby master + // specified in the config file + if (ax.standby_master_hostname && strcmp(ax.standby_master_hostname, ax.hosttab[i].hostname) == 0) + { + populate_smdw_aliases(&ax.hosttab[i]); + } + + addressinfo = ax.hosttab[i].addressinfo_head; + while (addressinfo) + { + addressinfo->ipstr = get_ip_for_host(addressinfo->address, &addressinfo->ipv6); + if (!addressinfo->ipstr) + { + interuptable_sleep(60); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "cannot convert host %s to IP", addressinfo->address); + } + addressinfo = addressinfo->next; + } + } + + // SANITY TEST AND DEBUG PRINT + TR0(("found %d unique live hosts from catalog\n", ax.hosttabsz)); + + for (i = 0; i < ax.hosttabsz; i++) + { + addressinfo_holder_t* addressinfo; + int counter = 0; + + TR0(("HOST: (hostname %s) (is_master %d) (datadir %s) (host_alias_count %d) (hdm %d) (hdw %d) (hbw %d) (hdc %d) (dia %d)\n", + 
ax.hosttab[i].hostname, + ax.hosttab[i].is_master, + ax.hosttab[i].data_dir, + ax.hosttab[i].address_count, + ax.hosttab[i].is_hdm, + ax.hosttab[i].is_hdw, + ax.hosttab[i].is_hbw, + ax.hosttab[i].is_hdc, + ax.hosttab[i].is_etl)); + + addressinfo = ax.hosttab[i].addressinfo_head; + while (addressinfo) + { + // extra sanity checking + counter++; + if (counter > ax.hosttab[i].address_count) + { + gpmon_fatalx(FLINE, 0, "address counter exceeds number of addresses for host %s", ax.hosttab[i].hostname); + } + + const char* ipv6on = NULL; + if (addressinfo->ipv6) + ipv6on = YES_TEXT; + else + ipv6on = NO_TEXT; + + TR1(("\tALIAS: (host %s) (ipstr %s) (ipv6 %s)\n", addressinfo->address, addressinfo->ipstr, ipv6on)); + addressinfo = addressinfo->next; + } + } + + +} + +/* send a packet thru sock */ +static apr_status_t sendpkt(int sock, const gp_smon_to_mmon_packet_t* pkt) +{ + const char* p = (const char*) pkt; + const char* q = p + sizeof(*pkt); + while (p < q) + { + int n = send(sock, p, q - p, 0); + if (n == -1) + { + switch (errno) + { + case EINTR: + case EAGAIN: + continue; + } + return APR_FROM_OS_ERROR(errno); + } + p += n; + } + return 0; +} +/* recv data through a sock */ +static apr_status_t recv_data(int sock, char* data, size_t data_size) +{ + char* p = data; + char* q = p + data_size; + while (p < q) + { + int n = recv(sock, p, q - p, 0); + if (n == -1) + { + // because we use blocking recv. if errno = EAGAIN, + // it means timeout happened. + if (errno == EINTR) + continue; + return APR_FROM_OS_ERROR(errno); + } + // because we use blocking recv, it indicates gpsmon + // shutdown or closed connection when n = 0. 
+ if (n == 0) + return APR_FROM_OS_ERROR(EINTR); + p += n; + } + + TR2(("read data from sock %d\n", sock)); + return 0; +} + +/* recv a packet thru sock */ +static apr_status_t recvpkt(int sock, gp_smon_to_mmon_packet_t* pkt, bool loop_until_all_recv) +{ + int e = 0; + + //receive the header + if (0 != (e = recv_data(sock, (char *)&pkt->header, sizeof(gp_smon_to_mmon_header_t)))) { + return e; + } + + if (pkt->header.pkttype == GPMON_PKTTYPE_QEXEC) + { + // Get the data portion, then get the line + if (0 != (e = recv_data(sock, (char *)&pkt->u.qexec_packet.data, sizeof(qexec_packet_data_t)))) + { + return e; + } + } + else + { + //receive the union packet + if (0 != (e = recv_data(sock, (char *)&pkt->u, get_size_by_pkttype_smon_to_mmon(pkt->header.pkttype)))) { + return e; + } + } + return 0; +} + + +static void getconfig(void) +{ + char *hostname = NULL; + char *master_data_directory = NULL; + char *standby_master_hostname = NULL; + int rc = 0; + + static apr_pool_t *pool = NULL; + + ax.port = perfmon_port; + + if (pool == NULL) + { + if (APR_SUCCESS != (rc = apr_pool_create_alloc(&pool, NULL))) + { + interuptable_sleep(30); // sleep to prevent loop of forking process and failing + gpmon_fatalx(FLINE, rc, "Failed to create APR pool\n"); + } + } + + /* fetch datadir */ + gpdb_get_master_data_dir(&hostname, &master_data_directory, pool); + if (ax.master_data_directory == NULL) + { + if (master_data_directory == NULL) + { + gpmon_fatalx(FLINE, rc, "Failed to create APR pool: failed to resolve master data directory\n"); + } + ax.master_data_directory = strdup(master_data_directory); + CHECKMEM(ax.master_data_directory); + } + + /* fetch standby master hostname */ + gpdb_get_single_string_from_query("select hostname from gp_segment_configuration where content = -1 and role = 'm'", &standby_master_hostname, pool); + if (standby_master_hostname) + { + ax.standby_master_hostname = strdup(standby_master_hostname); + CHECKMEM(ax.standby_master_hostname); + } + else + { + 
ax.standby_master_hostname = NULL; + } + + /* clear pool for next call */ + apr_pool_clear(pool); +} + +/* + * Entrypoint of gpmmon + * + * Init hooks + * Define GUCs + * start gpmmon bgworker + */ +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + { + ereport(ERROR, (errmsg("gpmmon not in shared_preload_libraries"))); + } + else + { + /* add version info */ + ereport(INFO, (errmsg("booting gpmmon"))); + print_version(); + } + + BackgroundWorker worker; + memset(&worker, 0, sizeof(BackgroundWorker)); + + def_gucs(); + + /* start gpmmon only on coordinator */ + if (!IS_QUERY_DISPATCHER()) + { + return; + } + + if (!perfmon_enabled) + return; + /* TODO: access database 'gpperfmon' directly rather than using libpq */ + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + worker.bgw_start_time = BgWorkerStart_RecoveryFinished; + /* launcher process should be restarted after pm reset. */ + worker.bgw_restart_time = BGW_DEFAULT_RESTART_INTERVAL; + snprintf(worker.bgw_library_name, BGW_MAXLEN, GPMMON_BINARY_NAME); + snprintf(worker.bgw_function_name, BGW_MAXLEN, "perfmon_main"); + worker.bgw_notify_pid = 0; + + snprintf(worker.bgw_name, BGW_MAXLEN, "[gpmmon]"); + + RegisterBackgroundWorker(&worker); +} + +void +_PG_fini(void) +{} + +static void +def_gucs(void) +{ + DefineCustomIntVariable("perfmon.port", "Sets the port number of perfmon.", NULL, &perfmon_port, + 8888, 1024, 65535, PGC_POSTMASTER, 0, NULL, NULL, NULL); + + DefineCustomBoolVariable("perfmon.enable", "Enable perfmon monitoring.", NULL, + &perfmon_enabled, false, PGC_POSTMASTER, 0, NULL, NULL, NULL); +} diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c new file mode 100644 index 00000000000..51ccc087750 --- /dev/null +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -0,0 +1,1556 @@ +#undef GP_VERSION +#include "postgres_fe.h" + +#include "apr_general.h" +#include "apr_hash.h" +#include "apr_time.h" +#include 
"apr_queue.h" +#include "apr_strings.h" +#include "gpmon.h" +#include "gpmondb.h" +#include "gpmon_agg.h" +#include +#include +#include +#include +#include +#include + +typedef enum disk_space_message_t +{ + DISK_SPACE_NO_MESSAGE_SENT = 0, + DISK_SPACE_WARNING_SENT , + DISK_SPACE_ERROR_SENT +} disk_space_message_t; + +typedef struct mmon_fsinfo_t +{ + gpmon_fsinfokey_t key; + + apr_int64_t bytes_used; + apr_int64_t bytes_available; + apr_int64_t bytes_total; + disk_space_message_t sent_error_flag; + time_t last_update_timestamp; +} mmon_fsinfo_t; //the fsinfo structure used in mmon + +typedef struct mmon_qexec_t +{ + gpmon_qexeckey_t key; + apr_uint64_t rowsout; + apr_uint64_t _cpu_elapsed; /* CPU elapsed for iter */ + apr_uint64_t measures_rows_in; +} mmon_qexec_t; //The qexec structure used in mmon + +typedef struct mmon_query_seginfo_t +{ + gpmon_query_seginfo_key_t key; + apr_int64_t final_rowsout; + apr_uint64_t sum_cpu_elapsed; + apr_uint64_t sum_measures_rows_out; +} mmon_query_seginfo_t; //The agg value at segment level for query + +typedef struct qdnode_t { + apr_int64_t last_updated_generation; + int recorded; + int num_metrics_packets; + gpmon_qlog_t qlog; + apr_hash_t* qexec_hash; + apr_hash_t* query_seginfo_hash; +} qdnode_t; + +struct agg_t +{ + apr_int64_t generation; + apr_pool_t* pool; + apr_pool_t* parent_pool; + apr_hash_t* qtab; /* key = gpmon_qlog_key_t, value = qdnode ptr. 
*/ + apr_hash_t* htab; /* key = hostname, value = gpmon_metrics_t ptr */ + apr_hash_t* stab; /* key = databaseid, value = gpmon_seginfo_t ptr */ + apr_hash_t* fsinfotab; /* This is the persistent fsinfo hash table: key = gpmon_fsinfokey_t, value = mmon_fsinfo_t ptr */ +}; + +typedef struct dbmetrics_t { + apr_int32_t queries_total; + apr_int32_t queries_running; + apr_int32_t queries_queued; +} dbmetrics_t; + +extern int min_query_time; +extern mmon_options_t opt; +extern apr_queue_t* message_queue; + +extern void incremement_tail_bytes(apr_uint64_t bytes); +static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, + apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool); + +/** + * Disk space check helper function + * Note- trys to push a message on a queue so that the message thread can send the message + */ +/* gp_elog has been removed */ +/* +static apr_status_t check_disk_space(mmon_fsinfo_t* rec) +{ + static time_t interval_start_time = 0; + static unsigned int number_messages_sent_this_interval = 0; + time_t now = 0; + int used_disk_space_percent = ROUND_DIVIDE((rec->bytes_used *100),rec->bytes_total); + + now = time(NULL); + // reset the interval if needed + if ((now - interval_start_time) >= opt.disk_space_interval){ + interval_start_time = now; + number_messages_sent_this_interval = 0; + } + + // Check the disk space if we haven't already sent an error + if (rec->sent_error_flag != DISK_SPACE_ERROR_SENT) { + disk_space_message_t send_flag = DISK_SPACE_NO_MESSAGE_SENT; + char* message = 0; + + // check for errors and then warnings + if ((opt.error_disk_space_percentage != 0) && (used_disk_space_percent >= opt.error_disk_space_percentage)) { + //Send an error if the error_disk_space_percentage threshold is set and the used_disk_space_percent is greater or equal to it + send_flag = DISK_SPACE_ERROR_SENT; + message = "ERROR"; + } else if ((rec->sent_error_flag != DISK_SPACE_WARNING_SENT) && (opt.warning_disk_space_percentage != 0 ) && + 
(used_disk_space_percent >= opt.warning_disk_space_percentage)) { + //Send warning if the warning_disk_space_percentage threshold is set and the used_disk_space_percent is greater or equal to it + //and if a warning has not already been sent + send_flag = DISK_SPACE_WARNING_SENT; + message = "WARNING"; + } else if ((rec->sent_error_flag == DISK_SPACE_WARNING_SENT) && (used_disk_space_percent < opt.warning_disk_space_percentage)) { + //if a warning as been sent and the used disk has fallen below the below the warning threshold reset the send flag + rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; + } + + // Send a warning or error if needed by putting the message in a queue + if (send_flag != DISK_SPACE_NO_MESSAGE_SENT){ + //only sent the message if + if (number_messages_sent_this_interval < opt.max_disk_space_messages_per_interval) { + char *query; + apr_status_t status; + unsigned int query_size_max = NAMEDATALEN + GPMON_FSINFO_MAX_PATH + 200; + + query = malloc(query_size_max); + if (!query) { + TR0(("check_disk_space ERROR: malloc(%d) returned NULL, out of memory\n", query_size_max)); + return APR_ENOMEM; + } + snprintf(query, query_size_max, "select gp_elog('%s: percent used disk space for %s %s is %d%%', True)", + message, rec->key.hostname, rec->key.fsname, used_disk_space_percent); + + status = apr_queue_trypush(message_queue, (void *) query); + if (status == APR_EINTR) { //blocking interrupted try one more time + status = apr_queue_trypush(message_queue, (void *) query); + } + if (status != APR_SUCCESS) { + TR0(("check_disk_space ERROR: apr_queue_trypush returned %d; cannot send %s\n", status, query)); + free(query); + } else { + number_messages_sent_this_interval++; + } + + } else { + TR1(("check_disk_space: message max reached: Not sending message for %s %s. 
used_disk_space_percent = %d%%\n", rec->key.hostname, rec->key.fsname, used_disk_space_percent)); + } + + rec->sent_error_flag = send_flag; + } + + } else if ( ( opt.warning_disk_space_percentage != 0 ) && ( used_disk_space_percent < opt.warning_disk_space_percentage )) { + //if there is a warning percent to check and the used disk has fallen below the below the warning threshold reset the send flag + rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; + } else if ( ( opt.warning_disk_space_percentage == 0 ) && ( used_disk_space_percent < opt.error_disk_space_percentage )) { + //if there is no warning percent to check and the used disk has fallen below the below the error threshold reset the send flag + rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; + } + return 0; +} +*/ + +static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool) +{ + // get active query of session + char *key = apr_psprintf(pool, "%d", ssid); + char *active_query = apr_hash_get(hash, key, APR_HASH_KEY_STRING); + if (active_query == NULL) + { + TR0(("Found orphan query, tmid:%d, ssid:%d, ccnt:%d\n", tmid, ssid, ccnt)); + return true; + } + + // read query text from q file + char *query = get_query_text(tmid, ssid, ccnt, pool); + if (query == NULL) + { + TR0(("Found error while reading query text in file '%sq%d-%d-%d.txt'\n", GPMON_DIR, tmid, ssid, ccnt)); + return true; + } + // if the current active query of session (ssid) is not the same + // as the one we are checking, we assume q(tmid)-(ssid)-(ccnt).txt + // has wrong status. This is a bug in execMain.c, which too hard to + // fix it there. 
+ int qlen = strlen(active_query); + if (qlen > MAX_QUERY_COMPARE_LENGTH) + { + qlen = MAX_QUERY_COMPARE_LENGTH; + } + int res = strncmp(query, active_query, qlen); + if (res != 0) + { + TR0(("Found orphan query, tmid:%d, ssid:%d, ccnt:%d\n", tmid, ssid, ccnt)); + return true; + } + + return false; +} + +static apr_status_t agg_put_fsinfo(agg_t* agg, const gpmon_fsinfo_t* met) +{ + mmon_fsinfo_t* rec; + + rec = apr_hash_get(agg->fsinfotab, &met->key, sizeof(met->key)); + if (!rec) { + // Use the parent pool because we need the fsinfo to be persistent and never be freed + rec = apr_palloc(agg->parent_pool, sizeof(*rec)); + if (!rec) + return APR_ENOMEM; + rec->key = met->key; + rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; + apr_hash_set(agg->fsinfotab, &met->key, sizeof(met->key), rec); + } + rec->bytes_available = met->bytes_available; + rec->bytes_total = met->bytes_total; + rec->bytes_used = met->bytes_used; + rec->last_update_timestamp = time(NULL); //set the updated timestamp for the packet + + // if both the option percentages are set to 0 than the disk space check is disabled + // Also if max_disk_space_messages_per_interval is 0 the disk space check is disabled + //if (((opt.warning_disk_space_percentage) || (opt.error_disk_space_percentage)) && + // (opt.max_disk_space_messages_per_interval != 0)) { + // check_disk_space(rec); + //} + return 0; +} + +static apr_status_t agg_put_queryseg(agg_t* agg, const gpmon_query_seginfo_t* met, apr_int64_t generation) +{ + qdnode_t* dp; + gpmon_qlogkey_t key; + mmon_query_seginfo_t* rec = 0; + + /* find qdnode of this qexec */ + key.tmid = met->key.qkey.tmid; + key.ssid = met->key.qkey.ssid; + key.ccnt = met->key.qkey.ccnt; + dp = apr_hash_get(agg->qtab, &key, sizeof(key)); + + if (!dp) { /* not found, internal SPI query. Ignore. 
*/ + return 0; + } + rec = apr_hash_get(dp->query_seginfo_hash, &met->key.segid, sizeof(met->key.segid)); + + /* if found, replace it */ + if (rec) { + rec->final_rowsout = met->final_rowsout; + rec->sum_cpu_elapsed += met->sum_cpu_elapsed; + rec->sum_measures_rows_out += met->sum_measures_rows_out; + } + else { + /* not found, make new hash entry */ + + if (!(rec = apr_palloc(agg->pool, sizeof(mmon_query_seginfo_t)))){ + + return APR_ENOMEM; + } + memcpy(&rec->key, &met->key, sizeof(gpmon_query_seginfo_key_t)); + rec->final_rowsout = met->final_rowsout; + rec->sum_cpu_elapsed = met->sum_cpu_elapsed; + rec->sum_measures_rows_out = met->sum_measures_rows_out; + + apr_hash_set(dp->query_seginfo_hash, &rec->key.segid, sizeof(rec->key.segid), rec); + } + + dp->last_updated_generation = generation; + return 0; +} + +static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) +{ + gpmon_metrics_t* rec; + + rec = apr_hash_get(agg->htab, met->hname, APR_HASH_KEY_STRING); + if (rec) { + *rec = *met; + } else { + rec = apr_palloc(agg->pool, sizeof(*rec)); + if (!rec) + return APR_ENOMEM; + *rec = *met; + apr_hash_set(agg->htab, rec->hname, APR_HASH_KEY_STRING, rec); + } + return 0; +} + +static apr_status_t agg_put_segment(agg_t* agg, const gpmon_seginfo_t* seg) +{ + gpmon_seginfo_t* rec; + + rec = apr_hash_get(agg->stab, &seg->dbid, sizeof(seg->dbid)); + if (rec) + { + *rec = *seg; + } + else + { + rec = apr_palloc(agg->pool, sizeof(*rec)); + if (!rec) + { + return APR_ENOMEM; + } + *rec = *seg; + apr_hash_set(agg->stab, &rec->dbid, sizeof(rec->dbid), rec); + } + return 0; +} + +static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) +{ + qdnode_t* node; + + node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); + if (!node) { + gpmon_qlogkey_t new_key = qlog->key; + new_key.ccnt = 0; + node = apr_hash_get(agg->qtab, &new_key, sizeof(new_key)); + } + if (node) + { + // here update the stats for the 
query + node->qlog.cpu_elapsed += qlog->cpu_elapsed; + node->qlog.p_metrics.cpu_pct += qlog->p_metrics.cpu_pct; + node->last_updated_generation = generation; + node->num_metrics_packets++; + TR2(("Query Metrics: (host %s ssid %d ccnt %d) (cpuelapsed %d cpupct %f) / %d\n", + qlog->user, qlog->key.ssid, qlog->key.ccnt, (int) node->qlog.cpu_elapsed, node->qlog.p_metrics.cpu_pct, + node->num_metrics_packets)); + } + return 0; +} + +static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, + apr_int64_t generation) +{ + qdnode_t* node; + + node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); + if (node) { + //node->qlog = *qlog; + merge_qlog(&node->qlog, qlog); + if (qlog->dbid != gpperfmon_dbid) { + TR2(("agg_put_qlog: found %d.%d.%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); + } + } else { + node = apr_pcalloc(agg->pool, sizeof(*node)); + if (!node) + return APR_ENOMEM; + + node->qlog = *qlog; + node->recorded = 0; + node->qlog.cpu_elapsed = 0; + node->qlog.p_metrics.cpu_pct = 0.0; + node->num_metrics_packets = 0; + + node->qexec_hash = apr_hash_make(agg->pool); + if (!node->qexec_hash) { + TR2(("agg_put_qlog: qexec_hash = apr_hash_make(agg->pool) returned null\n")); + return APR_ENOMEM; + } + + node->query_seginfo_hash = apr_hash_make(agg->pool); + if (!node->query_seginfo_hash) { + TR2(("agg_put_qlog: query_seginfo_hash = apr_hash_make(agg->pool) returned null\n")); + return APR_ENOMEM; + } + + apr_hash_set(agg->qtab, &node->qlog.key, sizeof(node->qlog.key), node); + if (qlog->dbid != gpperfmon_dbid) { + TR2(("agg_put: new %d.%d.%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); + } + } + node->last_updated_generation = generation; + + return 0; +} + + +static apr_status_t agg_put_qexec(agg_t* agg, const qexec_packet_t* qexec_packet, apr_int64_t generation) +{ + qdnode_t* dp; + gpmon_qlogkey_t key; + 
mmon_qexec_t* mmon_qexec_existing = 0; + + /* find qdnode of this qexec */ + key.tmid = qexec_packet->data.key.tmid; + key.ssid = qexec_packet->data.key.ssid; + key.ccnt = qexec_packet->data.key.ccnt; + dp = apr_hash_get(agg->qtab, &key, sizeof(key)); + + if (!dp) { /* not found, internal SPI query. Ignore. */ + return 0; + } + + mmon_qexec_existing = apr_hash_get(dp->qexec_hash, &qexec_packet->data.key.hash_key, sizeof(qexec_packet->data.key.hash_key)); + + /* if found, replace it */ + if (mmon_qexec_existing) { + mmon_qexec_existing->key.ccnt = qexec_packet->data.key.ccnt; + mmon_qexec_existing->key.ssid = qexec_packet->data.key.ssid; + mmon_qexec_existing->key.tmid = qexec_packet->data.key.tmid; + mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; + mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; + mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; + } + else { + /* not found, make new hash entry */ + if (! (mmon_qexec_existing = apr_palloc(agg->pool, sizeof(mmon_qexec_t)))) + return APR_ENOMEM; + + memcpy(&mmon_qexec_existing->key, &qexec_packet->data.key, sizeof(gpmon_qexeckey_t)); + mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; + mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; + mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; + apr_hash_set(dp->qexec_hash, &mmon_qexec_existing->key.hash_key, sizeof(mmon_qexec_existing->key.hash_key), mmon_qexec_existing); + } + + dp->last_updated_generation = generation; + return 0; +} + + +apr_status_t agg_create(agg_t** retagg, apr_int64_t generation, apr_pool_t* parent_pool, apr_hash_t* fsinfotab) +{ + int e; + apr_pool_t* pool; + agg_t* agg; + + if (0 != (e = apr_pool_create_alloc(&pool, parent_pool))) + return e; + + agg = apr_pcalloc(pool, sizeof(*agg)); + if (!agg) { + apr_pool_destroy(pool); + return APR_ENOMEM; + } + + agg->generation = generation; + agg->pool = pool; + agg->parent_pool = parent_pool; 
+ agg->fsinfotab = fsinfotab; // This hash table for the fsinfo is persistent and will use the parent pool + + agg->qtab = apr_hash_make(pool); + if (!agg->qtab) { + apr_pool_destroy(pool); + return APR_ENOMEM; + } + + agg->htab = apr_hash_make(pool); + if (!agg->htab) { + apr_pool_destroy(pool); + return APR_ENOMEM; + } + + agg->stab = apr_hash_make(pool); + if (!agg->stab) { + apr_pool_destroy(pool); + return APR_ENOMEM; + } + + *retagg = agg; + return 0; +} + + + +apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr_hash_t* fsinfotab) +{ + int e, cnt; + agg_t* newagg; + apr_hash_index_t *hi, *hj; + + if (0 != (e = agg_create(&newagg, oldagg->generation + 1, parent_pool, fsinfotab))) + { + return e; + } + + apr_hash_t *active_query_tab = get_active_queries(newagg->pool); + if (! active_query_tab) + { + agg_destroy(newagg); + return APR_EINVAL; + } + + for (hi = apr_hash_first(0, oldagg->qtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + qdnode_t* dp; + qdnode_t* newdp; + apr_int32_t status; + + apr_hash_this(hi, 0, 0, &vptr); + dp = vptr; + + /* skip all entries that weren't updated recently and aren't waiting in a queue */ + /* Read status from query text as this is reliable */ + status = get_query_status(dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt); + + apr_int32_t age = newagg->generation - dp->last_updated_generation - 1; + if (age > 0) + { + if ( (status != GPMON_QLOG_STATUS_SUBMIT + && status != GPMON_QLOG_STATUS_CANCELING + && status != GPMON_QLOG_STATUS_START) + || ((age % 5 == 0) /* don't call is_query_not_active every time because it's expensive */ + && is_query_not_active(dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, active_query_tab, newagg->pool))) + { + if (dp->qlog.dbid != gpperfmon_dbid) + { + TR2(("agg_dup: skip %d.%d.%d generation %d, current generation %d, recorded %d\n", + dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, + (int) dp->last_updated_generation, (int) 
newagg->generation, dp->recorded)); + } + continue; + } + } + + /* check if we missed a status change */ + if (dp->qlog.status != status) + dp->qlog.status = status; + + if (dp->qlog.dbid != gpperfmon_dbid) { + TR2( ("agg_dup: add %d.%d.%d, generation %d, recorded %d:\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, (int) dp->last_updated_generation, dp->recorded)); + } + + /* dup this entry */ + if (!(newdp = apr_palloc(newagg->pool, sizeof(*newdp)))) { + agg_destroy(newagg); + return APR_ENOMEM; + } + + *newdp = *dp; + + newdp->qexec_hash = apr_hash_make(newagg->pool); + if (!newdp->qexec_hash) { + agg_destroy(newagg); + return APR_ENOMEM; + } + + cnt = 0; + // Copy the qexec hash table + for (hj = apr_hash_first(newagg->pool, dp->qexec_hash); hj; hj = apr_hash_next(hj)) { + mmon_qexec_t* new_qexec; + apr_hash_this(hj, 0, 0, &vptr); + + //allocate the packet + if (!(new_qexec = apr_pcalloc(newagg->pool, sizeof(mmon_qexec_t)))) { + agg_destroy(newagg); + return APR_ENOMEM; + } + *new_qexec = *((mmon_qexec_t*)vptr); + + apr_hash_set(newdp->qexec_hash, &(new_qexec->key.hash_key), sizeof(new_qexec->key.hash_key), new_qexec); + TR2( ("\t %d: (%d, %d)\n", ++cnt, new_qexec->key.hash_key.segid, new_qexec->key.hash_key.nid)); + } + + newdp->query_seginfo_hash = apr_hash_make(newagg->pool); + if (!newdp->query_seginfo_hash) { + agg_destroy(newagg); + return APR_ENOMEM; + } + + cnt = 0; + // Copy the query_seginfo hash table + for (hj = apr_hash_first(newagg->pool, dp->query_seginfo_hash); hj; hj = apr_hash_next(hj)) { + mmon_query_seginfo_t* new_query_seginfo; + apr_hash_this(hj, 0, 0, &vptr); + + if (!(new_query_seginfo = apr_pcalloc(newagg->pool, sizeof(mmon_query_seginfo_t)))) { + agg_destroy(newagg); + return APR_ENOMEM; + } + *new_query_seginfo = *((mmon_query_seginfo_t*)vptr); + + apr_hash_set(newdp->query_seginfo_hash, &(new_query_seginfo->key.segid), sizeof(new_query_seginfo->key.segid), new_query_seginfo); + TR2( ("\t %d: (%d)\n", ++cnt, 
new_query_seginfo->key.segid)); + } + + // reset metrics that are accumulated each quantum + newdp->qlog.cpu_elapsed = 0; + newdp->qlog.p_metrics.cpu_pct = 0.0; + newdp->num_metrics_packets = 0; + + apr_hash_set(newagg->qtab, &newdp->qlog.key, sizeof(newdp->qlog.key), newdp); + } + + *retagg = newagg; + return 0; +} + +void agg_destroy(agg_t* agg) +{ + apr_pool_destroy(agg->pool); +} + +apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt) +{ + if (pkt->header.pkttype == GPMON_PKTTYPE_METRICS) + return agg_put_metrics(agg, &pkt->u.metrics); + if (pkt->header.pkttype == GPMON_PKTTYPE_QLOG) + return agg_put_qlog(agg, &pkt->u.qlog, agg->generation); + if (pkt->header.pkttype == GPMON_PKTTYPE_QEXEC) + return agg_put_qexec(agg, &pkt->u.qexec_packet, agg->generation); + if (pkt->header.pkttype == GPMON_PKTTYPE_SEGINFO) + return agg_put_segment(agg, &pkt->u.seginfo); + if (pkt->header.pkttype == GPMON_PKTTYPE_QUERY_HOST_METRICS) + return agg_put_query_metrics(agg, &pkt->u.qlog, agg->generation); + if (pkt->header.pkttype == GPMON_PKTTYPE_FSINFO) + return agg_put_fsinfo(agg, &pkt->u.fsinfo); + if (pkt->header.pkttype == GPMON_PKTTYPE_QUERYSEG) + return agg_put_queryseg(agg, &pkt->u.queryseg, agg->generation); + + gpmon_warning(FLINE, "unknown packet type %d", pkt->header.pkttype); + return 0; +} + + +typedef struct bloom_t bloom_t; +struct bloom_t { + unsigned char map[1024]; +}; +static void bloom_init(bloom_t* bloom); +static void bloom_set(bloom_t* bloom, const char* name); +static int bloom_isset(bloom_t* bloom, const char* name); +static void delete_old_files(bloom_t* bloom); +static apr_uint32_t write_fsinfo(agg_t* agg, const char* nowstr); +static apr_uint32_t write_system(agg_t* agg, const char* nowstr); +static apr_uint32_t write_segmentinfo(agg_t* agg, char* nowstr); +static apr_uint32_t write_dbmetrics(dbmetrics_t* dbmetrics, char* nowstr); +static apr_uint32_t write_qlog(FILE* fp, qdnode_t *qdnode, const char* nowstr, apr_uint32_t done); +static 
apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr); + +apr_status_t agg_dump(agg_t* agg) +{ + apr_hash_index_t *hi; + bloom_t bloom; + char nowstr[GPMON_DATE_BUF_SIZE]; + FILE* fp_queries_now = 0; + FILE* fp_queries_tail = 0; + + dbmetrics_t dbmetrics = {0}; + + apr_uint32_t temp_bytes_written = 0; + + gpmon_datetime_rounded(time(NULL), nowstr); + + bloom_init(&bloom); + + /* we never delete system_tail/ system_now/ + queries_tail/ queries_now/ files */ + bloom_set(&bloom, GPMON_DIR "system_now.dat"); + bloom_set(&bloom, GPMON_DIR "system_tail.dat"); + bloom_set(&bloom, GPMON_DIR "system_stage.dat"); + bloom_set(&bloom, GPMON_DIR "_system_tail.dat"); + bloom_set(&bloom, GPMON_DIR "queries_now.dat"); + bloom_set(&bloom, GPMON_DIR "queries_tail.dat"); + bloom_set(&bloom, GPMON_DIR "queries_stage.dat"); + bloom_set(&bloom, GPMON_DIR "_queries_tail.dat"); + bloom_set(&bloom, GPMON_DIR "database_now.dat"); + bloom_set(&bloom, GPMON_DIR "database_tail.dat"); + bloom_set(&bloom, GPMON_DIR "database_stage.dat"); + bloom_set(&bloom, GPMON_DIR "_database_tail.dat"); + bloom_set(&bloom, GPMON_DIR "segment_now.dat"); + bloom_set(&bloom, GPMON_DIR "segment_tail.dat"); + bloom_set(&bloom, GPMON_DIR "segment_stage.dat"); + bloom_set(&bloom, GPMON_DIR "_segment_tail.dat"); + bloom_set(&bloom, GPMON_DIR "diskspace_now.dat"); + bloom_set(&bloom, GPMON_DIR "diskspace_tail.dat"); + bloom_set(&bloom, GPMON_DIR "diskspace_stage.dat"); + bloom_set(&bloom, GPMON_DIR "_diskspace_tail.dat"); + + + /* dump metrics */ + temp_bytes_written = write_system(agg, nowstr); + incremement_tail_bytes(temp_bytes_written); + + /* write segment metrics */ + temp_bytes_written = write_segmentinfo(agg, nowstr); + incremement_tail_bytes(temp_bytes_written); + + /* write fsinfo metrics */ + temp_bytes_written = write_fsinfo(agg, nowstr); + incremement_tail_bytes(temp_bytes_written); + + if (! 
(fp_queries_tail = fopen(GPMON_DIR "queries_tail.dat", "a"))) + return APR_FROM_OS_ERROR(errno); + + /* loop through queries */ + for (hi = apr_hash_first(0, agg->qtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + qdnode_t* qdnode; + apr_hash_this(hi, 0, 0, &vptr); + qdnode = vptr; + + if (qdnode->qlog.status == GPMON_QLOG_STATUS_DONE || qdnode->qlog.status == GPMON_QLOG_STATUS_ERROR) + { + if (!qdnode->recorded && ((qdnode->qlog.tfin - qdnode->qlog.tstart) >= min_query_time)) + { + TR1(("queries_tail: %p add query %d.%d.%d, status %d, generation %d, recorded %d\n", + agg->qtab, qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, qdnode->qlog.status, (int) qdnode->last_updated_generation, qdnode->recorded)); + + temp_bytes_written += write_qlog_full(fp_queries_tail, qdnode, nowstr); + incremement_tail_bytes(temp_bytes_written); + + qdnode->recorded = 1; + } + } + else + { + switch (qdnode->qlog.status) + { + case GPMON_QLOG_STATUS_START: + case GPMON_QLOG_STATUS_CANCELING: + dbmetrics.queries_running++; + break; + case GPMON_QLOG_STATUS_SUBMIT: + dbmetrics.queries_queued++; + break; + default: + /* Not interested */ + break; + } + } + } + dbmetrics.queries_total = dbmetrics.queries_running + dbmetrics.queries_queued; + + fclose(fp_queries_tail); + fp_queries_tail = 0; + + /* dump dbmetrics */ + temp_bytes_written += write_dbmetrics(&dbmetrics, nowstr); + incremement_tail_bytes(temp_bytes_written); + + if (! 
(fp_queries_now = fopen(GPMON_DIR "_queries_now.dat", "w"))) + return APR_FROM_OS_ERROR(errno); + + for (hi = apr_hash_first(0, agg->qtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + qdnode_t* qdnode; + + apr_hash_this(hi, 0, 0, &vptr); + qdnode = vptr; + + /* don't touch this file */ + { + const int fname_size = sizeof(GPMON_DIR) + 100; + char fname[fname_size]; + snprintf(fname, fname_size, GPMON_DIR "q%d-%d-%d.txt", + qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, + qdnode->qlog.key.ccnt); + + bloom_set(&bloom, fname); + } + + /* write to _query_now.dat */ + if (qdnode->qlog.status != GPMON_QLOG_STATUS_DONE && qdnode->qlog.status != GPMON_QLOG_STATUS_ERROR) + { + write_qlog(fp_queries_now, qdnode, nowstr, 0); + } + else if (qdnode->qlog.tfin - qdnode->qlog.tstart >= min_query_time) + { + write_qlog(fp_queries_now, qdnode, nowstr, 1); + } + + } + + if (fp_queries_now) fclose(fp_queries_now); + if (fp_queries_tail) fclose(fp_queries_tail); + rename(GPMON_DIR "_system_now.dat", GPMON_DIR "system_now.dat"); + rename(GPMON_DIR "_segment_now.dat", GPMON_DIR "segment_now.dat"); + rename(GPMON_DIR "_queries_now.dat", GPMON_DIR "queries_now.dat"); + rename(GPMON_DIR "_database_now.dat", GPMON_DIR "database_now.dat"); + rename(GPMON_DIR "_diskspace_now.dat", GPMON_DIR "diskspace_now.dat"); + + /* clean up ... 
delete all old files by checking our bloom filter */ + delete_old_files(&bloom); + + return 0; +} + +extern int gpmmon_quantum(void); + +static void delete_old_files(bloom_t* bloom) +{ + char findDir[256] = {0}; + char findCmd[512] = {0}; + FILE* fp = NULL; + time_t cutoff = time(0) - gpmmon_quantum() * 3; + + /* Need to remove trailing / in dir so find results are consistent + * between platforms + */ + strncpy(findDir, GPMON_DIR, 255); + if (findDir[strlen(findDir) -1] == '/') + findDir[strlen(findDir) - 1] = '\0'; + + snprintf(findCmd, 512, "find %s -name \"q*-*.txt\" 2> /dev/null", findDir); + fp = popen(findCmd, "r"); + + if (fp) + { + for (;;) + { + char line[1024]; + char* p; + struct stat stbuf; + apr_int32_t status; + + line[sizeof(line) - 1] = 0; + if (! (p = fgets(line, sizeof(line), fp))) + break; + if (line[sizeof(line) - 1]) + continue; /* fname too long */ + + p = gpmon_trim(p); + TR2(("Checking file %s\n", p)); + + if (0 == stat(p, &stbuf)) + { +#if defined(linux) + int expired = stbuf.st_mtime < cutoff; +#else + int expired = stbuf.st_mtimespec.tv_sec < cutoff; +#endif + TR2(("File %s expired: %d\n", p, expired)); + if (expired) + { + apr_int32_t tmid = 0, ssid = 0, ccnt = 0; + if (bloom_isset(bloom, p)) + { + TR2(("File %s has bloom set. 
Checking status\n", p)); + /* Verify no bloom collision */ + sscanf(p, GPMON_DIR "q%d-%d-%d.txt", &tmid, &ssid, &ccnt); + TR2(("tmid: %d, ssid: %d, ccnt: %d\n", tmid, ssid, ccnt)); + status = get_query_status(tmid, ssid, ccnt); + TR2(("File %s has status of %d\n", p, status)); + if (status == GPMON_QLOG_STATUS_DONE || + status == GPMON_QLOG_STATUS_ERROR) + { + TR2(("Deleting file %s\n", p)); + unlink(p); + } + } + else + { + TR2(("Deleting file %s\n", p)); + unlink(p); + } + } + } + } + pclose(fp); + } + else + { + gpmon_warning(FLINE, "Failed to get a list of query text files.\n"); + } +} + +static apr_uint32_t write_segmentinfo(agg_t* agg, char* nowstr) +{ + FILE* fp = fopen(GPMON_DIR "segment_tail.dat", "a"); + FILE* fp2 = fopen(GPMON_DIR "_segment_now.dat", "w"); + apr_hash_index_t* hi; + const int line_size = 256; + char line[line_size]; + apr_uint32_t bytes_written = 0; + + if (!fp || !fp2) + { + if (fp) fclose(fp); + if (fp2) fclose(fp2); + return 0; + } + + for (hi = apr_hash_first(0, agg->stab); hi; hi = apr_hash_next(hi)) + { + gpmon_seginfo_t* sp; + int bytes_this_record; + void* valptr = 0; + apr_hash_this(hi, 0, 0, (void**) &valptr); + sp = (gpmon_seginfo_t*) valptr; + + snprintf(line, line_size, "%s|%d|%s|%" FMTU64 "|%" FMTU64, nowstr, sp->dbid, sp->hostname, sp->dynamic_memory_used, sp->dynamic_memory_available); + + bytes_this_record = strlen(line) + 1; + if (bytes_this_record == line_size) + { + gpmon_warning(FLINE, "segmentinfo line to too long ... 
ignored: %s", line); + continue; + } + fprintf(fp, "%s\n", line); + fprintf(fp2, "%s\n", line); + bytes_written += bytes_this_record; + } + + fclose(fp); + fclose(fp2); + return bytes_written; +} + +static apr_uint32_t write_fsinfo(agg_t* agg, const char* nowstr) +{ + FILE* fp = fopen(GPMON_DIR "diskspace_tail.dat", "a"); + FILE* fp2 = fopen(GPMON_DIR "_diskspace_now.dat", "w"); + apr_hash_index_t* hi; + const int line_size = 512; + char line[line_size]; + apr_uint32_t bytes_written = 0; + static time_t last_time_fsinfo_written = 0; + + if (!fp || !fp2) + { + if (fp) fclose(fp); + if (fp2) fclose(fp2); + return 0; + } + + for (hi = apr_hash_first(0, agg->fsinfotab); hi; hi = apr_hash_next(hi)) + { + mmon_fsinfo_t* fsp; + void* valptr = 0; + int bytes_this_line; + + apr_hash_this(hi, 0, 0, (void**) &valptr); + fsp = (mmon_fsinfo_t*) valptr; + + // We only want to write the fsinfo for packets that have been updated since the last time we wrote + // the fsinfo, so skip the fsinfo if its timestamp is less than the last time written timestamp + if (fsp->last_update_timestamp < last_time_fsinfo_written) { + continue; + } + + snprintf(line, line_size, "%s|%s|%s|%" FMT64 "|%" FMT64 "|%" FMT64, + nowstr, + fsp->key.hostname, + fsp->key.fsname, + fsp->bytes_total, + fsp->bytes_used, + fsp->bytes_available); + + TR2(("write_fsinfo(): writing %s\n", line)); + bytes_this_line = strlen(line) + 1; + if (bytes_this_line == line_size){ + gpmon_warning(FLINE, "fsinfo metrics line too long ... 
ignored: %s", line); + continue; + } + + fprintf(fp, "%s\n", line); + fprintf(fp2, "%s\n", line); + + bytes_written += bytes_this_line; + } + + fclose(fp); + fclose(fp2); + + last_time_fsinfo_written = time(NULL); //set the static time written variable + + return bytes_written; +} + +static apr_uint32_t write_dbmetrics(dbmetrics_t* dbmetrics, char* nowstr) +{ + FILE* fp = fopen(GPMON_DIR "database_tail.dat", "a"); + FILE* fp2 = fopen(GPMON_DIR "_database_now.dat", "w"); + int e; + const int line_size = 256; + char line[line_size]; + int bytes_written; + + if (!fp || !fp2) + { + e = APR_FROM_OS_ERROR(errno); + if (fp) fclose(fp); + if (fp2) fclose(fp2); + return e; + } + + snprintf(line, line_size, "%s|%d|%d|%d", nowstr, + dbmetrics->queries_total, + dbmetrics->queries_running, + dbmetrics->queries_queued); + + if (strlen(line) + 1 == line_size){ + gpmon_warning(FLINE, "dbmetrics line too long ... ignored: %s", line); + bytes_written = 0; + } else { + fprintf(fp, "%s\n", line); + fprintf(fp2, "%s\n", line); + bytes_written = strlen(line) + 1; + } + + fclose(fp); + fclose(fp2); + + return bytes_written; +} + +static apr_uint32_t write_system(agg_t* agg, const char* nowstr) +{ + FILE* fp = fopen(GPMON_DIR "system_tail.dat", "a"); + FILE* fp2 = fopen(GPMON_DIR "_system_now.dat", "w"); + apr_hash_index_t* hi; + const int line_size = 1000; + char line[line_size]; + apr_uint32_t bytes_written = 0; + + if (!fp || !fp2) + { + if (fp) fclose(fp); + if (fp2) fclose(fp2); + return 0; + } + + for (hi = apr_hash_first(0, agg->htab); hi; hi = apr_hash_next(hi)) + { + gpmon_metrics_t* mp; + void* valptr = 0; + int quantum = gpmmon_quantum(); + int bytes_this_line; + apr_hash_this(hi, 0, 0, (void**) &valptr); + mp = (gpmon_metrics_t*) valptr; + + snprintf(line, line_size, + "%s|%s|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%.2f|%.2f|%.2f|%.4f|%.4f|%.4f|%d|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" FMT64 "|%" 
FMT64, + nowstr, + mp->hname, + mp->mem.total, + mp->mem.used, + mp->mem.actual_used, + mp->mem.actual_free, + mp->swap.total, + mp->swap.used, + (apr_int64_t)ceil((double)mp->swap.page_in / (double)quantum), + (apr_int64_t)ceil((double)mp->swap.page_out / (double)quantum), + mp->cpu.user_pct, + mp->cpu.sys_pct, + mp->cpu.idle_pct, + mp->load_avg.value[0], + mp->load_avg.value[1], + mp->load_avg.value[2], + quantum, + mp->disk.ro_rate, + mp->disk.wo_rate, + mp->disk.rb_rate, + mp->disk.wb_rate, + mp->net.rp_rate, + mp->net.wp_rate, + mp->net.rb_rate, + mp->net.wb_rate); + + bytes_this_line = strlen(line) + 1; + if (bytes_this_line == line_size){ + gpmon_warning(FLINE, "system metrics line too long ... ignored: %s", line); + continue; + } + + fprintf(fp, "%s\n", line); + fprintf(fp2, "%s\n", line); + + bytes_written += bytes_this_line; + } + + fclose(fp); + fclose(fp2); + return bytes_written; +} + +static apr_int64_t get_rowsout(qdnode_t* qdnode) +{ + + apr_hash_index_t *hi; + //qenode_t* pqe = NULL; + apr_int64_t rowsout = 0; + void* valptr; + mmon_query_seginfo_t *query_seginfo; + + for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) + { + apr_hash_this(hi, 0, 0, &valptr); + query_seginfo = (mmon_query_seginfo_t*) valptr; + if (query_seginfo->final_rowsout != -1) + { + rowsout = query_seginfo->final_rowsout; + break; + } + } + return rowsout; +} + + +static void _get_sum_seg_info(apr_hash_t* segtab, apr_int64_t* total_data_out, int* segcount_out) +{ + apr_hash_index_t *hi; + void* valptr; + apr_int64_t* seg_data_sum = NULL; + + for (hi = apr_hash_first(NULL, segtab); hi; hi = apr_hash_next(hi)) + { + apr_hash_this(hi, 0, 0, &valptr); + seg_data_sum = (apr_int64_t*) valptr; + *total_data_out += *seg_data_sum; + TR2(("(SKEW) Segment resource usage: %d\n", (int) *seg_data_sum)); + (*segcount_out)++; + } +} + +static void _get_sum_deviation_squared(apr_hash_t* segtab, const apr_int64_t data_avg, apr_int64_t* 
total_deviation_squared_out) +{ + apr_hash_index_t *hi; + void* valptr; + apr_int64_t* seg_data_sum = NULL; + + for (hi = apr_hash_first(NULL, segtab); hi; hi = apr_hash_next(hi)) + { + apr_int64_t dev = 0; + + apr_hash_this(hi, NULL, NULL, &valptr); + seg_data_sum = (apr_int64_t*) valptr; + dev = *seg_data_sum - data_avg; + TR2(("(SKEW) Deviation: %d\n", (int) dev)); + *total_deviation_squared_out += dev * dev; + } +} + +static double get_cpu_skew(qdnode_t* qdnode) +{ + apr_pool_t* tmp_pool; + apr_hash_t* segtab; + apr_hash_index_t *hi; + + apr_int64_t cpu_avg = 0; + apr_int64_t total_cpu = 0; + apr_int64_t total_deviation_squared = 0; + double variance = 0; + double standard_deviation = 0; + double coefficient_of_variation = 0; + apr_int64_t* seg_cpu_sum = NULL; + void* valptr; + + int segcnt = 0; + int e; + + if (!qdnode) + return 0.0f; + + if (0 != (e = apr_pool_create_alloc(&tmp_pool, 0))) + { + gpmon_warningx(FLINE, e, "apr_pool_create_alloc failed"); + return 0.0f; + } + + segtab = apr_hash_make(tmp_pool); + if (!segtab) + { + gpmon_warning(FLINE, "Out of memory"); + return 0.0f; + } + + TR2(("Calc mean per segment\n")); + + for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) + { + mmon_query_seginfo_t *rec; + apr_hash_this(hi, 0, 0, &valptr); + rec = (mmon_query_seginfo_t*) valptr; + + if (rec->key.segid == -1) + continue; + + seg_cpu_sum = apr_hash_get(segtab, &rec->key.segid, sizeof(rec->key.segid)); + + if (!seg_cpu_sum) { + seg_cpu_sum = apr_palloc(tmp_pool, sizeof(apr_int64_t)); + *seg_cpu_sum = 0; + } + *seg_cpu_sum += rec->sum_cpu_elapsed; + apr_hash_set(segtab, &rec->key.segid, sizeof(rec->key.segid), seg_cpu_sum); + } + + _get_sum_seg_info(segtab, &total_cpu, &segcnt); + + if (!segcnt) { + TR2(("No segments for CPU skew calculation\n")); + apr_pool_destroy(tmp_pool); + return 0.0f; + } + + cpu_avg = total_cpu / segcnt; + TR2(("(SKEW) Avg resource usage: %" FMT64 "\n", cpu_avg)); + + 
_get_sum_deviation_squared(segtab, cpu_avg, &total_deviation_squared); + + variance = total_deviation_squared / (double)segcnt; + + standard_deviation = sqrt(variance); + + TR2(("(SKEW) CPU standard deviation: %f\n", standard_deviation)); + + coefficient_of_variation = cpu_avg ? standard_deviation/(double)cpu_avg : 0.0f; + + apr_pool_destroy(tmp_pool); + TR2(("(SKEW) CPU Skew: %f\n", coefficient_of_variation)); + + return coefficient_of_variation; +} + +static double get_row_skew(qdnode_t* qdnode) +{ + apr_pool_t* tmp_pool; + apr_hash_t* segtab; + apr_hash_index_t *hi; + + apr_int64_t total_row_out = 0; + apr_int64_t total_deviation_squared = 0; + double variance = 0.0f; + double standard_deviation = 0; + double coefficient_of_variation = 0; + apr_int64_t row_out_avg = 0; + apr_int64_t* seg_row_out_sum = NULL; + void* valptr; + + int segcnt = 0; + int e; + + if (!qdnode) + return 0.0f; + + if (0 != (e = apr_pool_create_alloc(&tmp_pool, 0))) + { + gpmon_warningx(FLINE, e, "apr_pool_create_alloc failed"); + return 0.0f; + } + + segtab = apr_hash_make(tmp_pool); + if (!segtab) + { + gpmon_warning(FLINE, "Out of memory"); + return 0.0f; + } + + /* Calc rows in sum per segment */ + TR2(("Calc rows in sum per segment\n")); + for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) + { + mmon_query_seginfo_t *rec; + apr_hash_this(hi, 0, 0, &valptr); + rec = (mmon_query_seginfo_t*) valptr; + + if (rec->key.segid == -1) + continue; + + seg_row_out_sum = apr_hash_get(segtab, &rec->key.segid, sizeof(rec->key.segid)); + + if (!seg_row_out_sum) { + seg_row_out_sum = apr_palloc(tmp_pool, sizeof(apr_int64_t)); + *seg_row_out_sum = 0; + } + *seg_row_out_sum += rec->sum_measures_rows_out; + apr_hash_set(segtab, &rec->key.segid, sizeof(rec->key.segid), seg_row_out_sum); + } + + _get_sum_seg_info(segtab, &total_row_out, &segcnt); + + if (!segcnt) { + TR2(("No segments for Rows skew calculation\n")); + apr_pool_destroy(tmp_pool); + return 0.0f; + } + + 
row_out_avg = total_row_out / segcnt; + + TR2(("(SKEW) Avg rows out: %" FMT64 "\n", row_out_avg)); + + _get_sum_deviation_squared(segtab, row_out_avg, &total_deviation_squared); + + variance = total_deviation_squared / (double)segcnt; + standard_deviation = sqrt(variance); + + TR2(("(SKEW) Rows in standard deviaton: %f\n", standard_deviation)); + + coefficient_of_variation = row_out_avg ? standard_deviation/(double)row_out_avg : 0.0f; + + apr_pool_destroy(tmp_pool); + TR2(("(SKEW) Rows out skew: %f\n", coefficient_of_variation)); + + return coefficient_of_variation; +} + + +static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const char* nowstr, apr_uint32_t done) +{ + char timsubmitted[GPMON_DATE_BUF_SIZE]; + char timstarted[GPMON_DATE_BUF_SIZE]; + char timfinished[GPMON_DATE_BUF_SIZE]; + double cpu_skew = 0.0f; + double row_skew = 0.0f; + int query_hash = 0; + apr_int64_t rowsout = 0; + float cpu_current; + cpu_skew = get_cpu_skew(qdnode); + row_skew = get_row_skew(qdnode); + rowsout = get_rowsout(qdnode); + gpmon_datetime((time_t)qdnode->qlog.tsubmit, timsubmitted); + + if (qdnode->qlog.tstart) + { + gpmon_datetime((time_t)qdnode->qlog.tstart, timstarted); + } + else + { + snprintf(timstarted, GPMON_DATE_BUF_SIZE, "null"); + } + + if (done) + { + cpu_current = 0.0f; + gpmon_datetime((time_t)qdnode->qlog.tfin, timfinished); + } + else + { + if (qdnode->num_metrics_packets) + { + // average cpu_pct per reporting machine + cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; + } + else + { + cpu_current = 0.0f; + } + snprintf(timfinished, GPMON_DATE_BUF_SIZE, "null"); + } + + snprintf(line, line_size, "%s|%d|%d|%d|%d|%s|%u|%d|%s|%s|%s|%s|%" FMT64 "|%" FMT64 "|%.4f|%.2f|%.2f|%d", + nowstr, + qdnode->qlog.key.tmid, + qdnode->qlog.key.ssid, + qdnode->qlog.key.ccnt, + qdnode->qlog.pid, + qdnode->qlog.user, + qdnode->qlog.dbid, + qdnode->qlog.cost, + timsubmitted, + timstarted, + timfinished, + 
gpmon_qlog_status_string(qdnode->qlog.status), + rowsout, + qdnode->qlog.cpu_elapsed, + cpu_current, + cpu_skew, + row_skew, + query_hash); +} + + +static apr_uint32_t write_qlog(FILE* fp, qdnode_t *qdnode, const char* nowstr, apr_uint32_t done) +{ + const int line_size = 1024; + char line[line_size]; + int bytes_written; + + fmt_qlog(line, line_size, qdnode, nowstr, done); + bytes_written = strlen(line) + 1; + + if (bytes_written == line_size) + { + gpmon_warning(FLINE, "qlog line too long ... ignored: %s", line); + return 0; + } + else + { + /* Query text "joined" by python script */ + fprintf(fp, "%s|||||\n", line); + return bytes_written; + } +} + +static int get_and_print_next_query_file_kvp(FILE* outfd, FILE* queryfd, char* qfname, apr_uint32_t* bytes_written) +{ + const int line_size = 1024; + char line[line_size]; + line[0] = 0; + char *p = NULL; + int field_len = 0; + int retCode = 0; + + p = fgets(line, line_size, queryfd); + line[line_size-1] = 0; // in case libc is buggy + + if (!p) { + gpmon_warning(FLINE, "Error parsing file: %s", qfname); + return APR_NOTFOUND; + } + + retCode = sscanf(p, "%d", &field_len); + + if (1 != retCode){ + gpmon_warning(FLINE, "bad format on file: %s", qfname); + return APR_NOTFOUND; + } + + if (field_len < 0) { + gpmon_warning(FLINE, "bad field length on file: %s", qfname); + return APR_NOTFOUND; + } + + if (!field_len) { + // empty field, read through the newline + p = fgets(line, line_size, queryfd); + if (p) + return APR_SUCCESS; + else + return APR_NOTFOUND; + } + + fprintf(outfd, "\""); + (*bytes_written)++; + + while (field_len > 0) { + int max, n; + char* q; + max = field_len > sizeof(line) ? 
sizeof(line) : field_len; + n = fread(line, 1, max, queryfd); + for (p = line, q = line + n; p < q; p++) + { + if (*p == '"') + { + fputc('\"', outfd); + (*bytes_written)++; + } + + fputc(*p, outfd); + + (*bytes_written)++; + + } + field_len -= n; + if (n < max) break; + } + + fprintf(outfd, "\""); + (*bytes_written)++; + + int n = fread(line, 1, 1, queryfd); + if (n != 1) + { + gpmon_warning(FLINE, "missing expected newline in file: %s", qfname); + return APR_NOTFOUND; + } + + return APR_SUCCESS; +} + +static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr) +{ + const int line_size = 1024; + const int qfname_size = 256; + char line[line_size]; + char qfname[qfname_size]; + FILE* qfptr = 0; + apr_uint32_t bytes_written = 0; + + fmt_qlog(line, line_size, qdnode, nowstr, 1); + bytes_written = strlen(line) + 1; + if (bytes_written == line_size) + { + gpmon_warning(FLINE, "qlog line too long ... ignored: %s", line); + return 0; + } + + fprintf(fp, "%s", line); + + snprintf(qfname, qfname_size, GPMON_DIR "q%d-%d-%d.txt", qdnode->qlog.key.tmid, + qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); + + qfptr = fopen(qfname, "r"); + if (!qfptr) + { + fprintf(fp, "|||||\n"); + bytes_written += 6; + return bytes_written; + } + + // 0 add query text + // 1 add query plan + // 2 add application name + // 3 add rsqname + // 4 add priority + + int total_iterations = 5; + int all_good = 1; + int iter; + int retCode = APR_SUCCESS; + for (iter = 0; iter < total_iterations; ++iter) + { + fprintf(fp, "|"); + bytes_written++; + + if (!all_good || iter == 1){ + // we have no data for query plan + // if we failed once already don't bother trying to parse query file + continue; + } + + retCode = get_and_print_next_query_file_kvp(fp, qfptr, qfname, &bytes_written); + if (retCode != APR_SUCCESS) + all_good = 0; + } + + fprintf(fp, "\n"); + fclose(qfptr); + return bytes_written; +} + +static void bloom_init(bloom_t* bloom) +{ + memset(bloom->map, 0, 
sizeof(bloom->map)); +} + +static void bloom_set(bloom_t* bloom, const char* name) +{ + apr_ssize_t namelen = strlen(name); + const unsigned int hashval = + apr_hashfunc_default(name, &namelen) % (8 * sizeof(bloom->map)); + const int idx = hashval / 8; + const int off = hashval % 8; + /* printf("bloom set %s h%d\n", name, hashval); */ + bloom->map[idx] |= (1 << off); +} + +static int bloom_isset(bloom_t* bloom, const char* name) +{ + apr_ssize_t namelen = strlen(name); + const unsigned int hashval = + apr_hashfunc_default(name, &namelen) % (8 * sizeof(bloom->map)); + const int idx = hashval / 8; + const int off = hashval % 8; + /* + printf("bloom check %s h%d = %d\n", name, hashval, + 0 != (bloom->map[idx] & (1 << off))); + */ + return 0 != (bloom->map[idx] & (1 << off)); +} diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.h b/contrib/perfmon/src/gpmmon/gpmon_agg.h new file mode 100644 index 00000000000..2267a5e2790 --- /dev/null +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.h @@ -0,0 +1,14 @@ +#ifndef GPMON_AGG_H +#define GPMON_AGG_H + +#include "apr_pools.h" +#include "gpmonlib.h" + +typedef struct agg_t agg_t; +apr_status_t agg_create(agg_t** retagg, apr_int64_t generation, apr_pool_t* parent_pool, apr_hash_t* fsinfotab); +apr_status_t agg_dup(agg_t** agg, agg_t* oldagg, apr_pool_t* pool, apr_hash_t* fsinfotab); +void agg_destroy(agg_t* agg); +apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt); +apr_status_t agg_dump(agg_t* agg); + +#endif diff --git a/contrib/perfmon/src/gpmmon/gpmondb.c b/contrib/perfmon/src/gpmmon/gpmondb.c new file mode 100644 index 00000000000..746808dcb7e --- /dev/null +++ b/contrib/perfmon/src/gpmmon/gpmondb.c @@ -0,0 +1,1729 @@ +#include "postgres.h" +#include +#include +#include +#include "gpmonlib.h" +#include "gpmondb.h" +#include "libpq-fe.h" +#include "apr_strings.h" +#include "apr_file_io.h" +#include "time.h" + +int gpdb_exec_search_for_at_least_one_row(const char*, PGconn*); +apr_status_t empty_harvest_file(const 
char*, apr_pool_t*, PGconn*); +apr_status_t truncate_tail_file(const char*, apr_pool_t*, PGconn*); +void upgrade_log_alert_table_distributed_key(PGconn*); + +#define GPMON_HOSTTTYPE_HDW 1 +#define GPMON_HOSTTTYPE_HDM 2 +#define GPMON_HOSTTTYPE_ETL 3 +#define GPMON_HOSTTTYPE_HBW 4 +#define GPMON_HOSTTTYPE_HDC 5 + +#define MAX_SMON_PATH_SIZE (1024) +#define MAX_OWNER_LENGTH (100) + +#define GPDB_CONNECTION_STRING "dbname='" GPMON_DB "' user='" GPMON_DBUSER "' connect_timeout='30'" +Oid gpperfmon_dbid; + +int find_token_in_config_string(char* buffer, char**result, const char* token) +{ + return 1; +} + +// assumes a valid connection already exists +static const char* gpdb_exec_only(PGconn* conn, PGresult** pres, const char* query) +{ + PGresult* res = 0; + ExecStatusType status; + + TR1(("Query: %s\n", query)); + + res = PQexec(conn, query); + status = PQresultStatus(res); + if (status != PGRES_COMMAND_OK && status != PGRES_TUPLES_OK) + return PQerrorMessage(conn); + + *pres = res; + return 0; +} + +static const bool gpdb_exec_ddl(PGconn* conn, const char* ddl_query) +{ + PGresult *result = NULL; + const char *errmsg = gpdb_exec_only(conn, &result, ddl_query); + PQclear(result); + if (errmsg) + { + gpmon_warning(FLINE, "failed to execute query '%s': %s\n", ddl_query, errmsg); + } + return errmsg == NULL; +} + +// creates a connection and then runs the query +static const char* gpdb_exec(PGconn** pconn, PGresult** pres, const char* query) +{ + const char *connstr = "dbname='" GPMON_DB "' user='" GPMON_DBUSER + "' connect_timeout='30'"; + PGconn *conn = NULL; + + conn = PQconnectdb(connstr); + // early assignment to pconn guarantees connection available to get freed by the caller + *pconn = conn; + + if (PQstatus(conn) != CONNECTION_OK) + return PQerrorMessage(conn); + + return gpdb_exec_only(conn, pres, query); +} + +// persistant_conn is optional if you are already holding an open connectionconn +// return 1 if more than 0 rows are returned from query +// return 0 if 
zero rows are returned from query +int gpdb_exec_search_for_at_least_one_row(const char* QUERY, PGconn* persistant_conn) +{ + PGconn* conn = 0; + PGresult* result = 0; + int rowcount; + int res = 0; + const char* errmsg; + + if (persistant_conn) + errmsg = gpdb_exec_only(persistant_conn, &result, QUERY); + else + errmsg = gpdb_exec(&conn, &result, QUERY); + + if (errmsg) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, QUERY); + } + else + { + rowcount = PQntuples(result); + if (rowcount > 0) + res = 1; + } + + PQclear(result); + + if (conn) + PQfinish(conn); + + return res; +} + +static bool should_recreate_from_result(PGresult *result, + const char *encoding, + bool script_exist, + int expected_encoding_num) +{ + ASSERT(result); + ASSERT(encoding); + int rowcount = PQntuples(result); + if (rowcount > 0) + { + const char* cmd = PQgetvalue(result, 0, 0); + int encoding_num = atoi(PQgetvalue(result, 0, 1)); + ASSERT(cmd); + const int MAX_ICONV_CMD_LEN = 50; + char iconv[MAX_ICONV_CMD_LEN]; + snprintf(iconv, sizeof(iconv), "iconv -f %s -t %s -c", encoding, encoding); + const char gpperfmoncat[] = "gpperfmoncat.sh"; + if (strncmp(cmd, iconv, sizeof(iconv)) != 0) + { + if (!script_exist && encoding_num == expected_encoding_num) + { + return false; + } + } + else if (strncmp(cmd, gpperfmoncat, sizeof(gpperfmoncat))) + { + if (script_exist && encoding_num == expected_encoding_num) + { + return false; + } + } + } + return true; +} + +// Whether log_alert_table needs to be recreated. The order command is +// EXECUTE 'cat ...' which would crash if gpdb-alert log files contain invalid +// character. We use 'iconv' instead of 'cat' to fix it. +// returns true if given table exist and uses 'cat' instead of 'iconv', then should be +// recreated. 
+// return false otherwise
+static bool gpdb_should_recreate_log_alert(PGconn *conn,
+					const char *table_name,
+					const char *encoding,
+					int expected_encoding_num,
+					bool script_exist)
+{
+	ASSERT(conn);
+	ASSERT(strcasecmp(table_name, "log_alert_tail") == 0 ||
+		strcasecmp(table_name, "log_alert_now") == 0);
+
+	PGresult *result = 0;
+
+	const char* errmsg = NULL;
+
+	const char* pattern = "select a.command, a.encoding from pg_exttable a, pg_class b "
+			"where a.reloid = b.oid and b.relname='%s'";
+
+	const int QRYBUFSIZ = 2000;
+	char query[QRYBUFSIZ];
+	snprintf(query, QRYBUFSIZ, pattern, table_name);
+
+	bool ret = true;
+
+	if (conn)
+		errmsg = gpdb_exec_only(conn, &result, query);
+
+	if (errmsg)
+	{
+		gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, query);
+	}
+	else
+	{
+		ret = should_recreate_from_result(result,
+				encoding,
+				script_exist,
+				expected_encoding_num);
+	}
+
+	PQclear(result);
+	return ret;
+}
+
+int gpdb_validate_gpperfmon(void)
+{
+	/* Check db */
+	gpperfmon_dbid = gpdb_gpperfmon_dbid();
+	if (!gpperfmon_dbid)
+		return 0;
+
+	/* check port */
+	if (!perfmon_port)
+		return 0;
+
+	/* check external tables are accessible by gpmon user */
+	if (!gpdb_validate_ext_table_access())
+		return 0;
+
+	return 1;
+}
+
+Oid gpdb_gpperfmon_dbid(void)
+{
+	Oid dbid = InvalidOid;
+	PGconn* conn = 0;
+	PGresult *result = NULL;
+	int rowcount;
+
+	const char* QUERY = "select oid as dbid from pg_database "
+			"where datname='gpperfmon'";
+	const char* errmsg = gpdb_exec(&conn, &result, QUERY);
+	if (errmsg)
+	{
+		fprintf(stderr, "Performance Monitor - failed to connect to gpperfmon database: %s",
+			(errmsg == NULL ?
"unknown reason" : errmsg)); + } + else + { + rowcount = PQntuples(result); + if (rowcount > 0) + { + dbid = DatumGetObjectId(CStringGetDatum(PQgetvalue(result, 0, 0))); + } + } + PQclear(result); + PQfinish(conn); + return dbid; +} + +int gpdb_validate_ext_table_access(void) +{ + const char* QUERY = "select * from master_data_dir"; + return gpdb_exec_search_for_at_least_one_row(QUERY, NULL); +} + +struct hostinfo_holder_t +{ + addressinfo_holder_t* addressinfo_head; + addressinfo_holder_t* addressinfo_tail; + apr_uint32_t address_count; + + char* datadir; + char* smon_dir; + char* hostname; + int is_master; + int is_hdm; + int is_hdw; + int is_hbw; + int is_hdc; + int is_etl; +}; + +void initializeHostInfoDataWithAddress(struct hostinfo_holder_t*, char*, int); +void initializeHostInfoDataFromFileEntry(apr_pool_t*, struct hostinfo_holder_t*,char*, char*, int, char*, char*); + + +void initializeHostInfoDataWithAddress(struct hostinfo_holder_t* holder, char* address, int firstAddress) +{ + // USE permenant memory to store this data + + addressinfo_holder_t* aiholder = calloc(1, sizeof(addressinfo_holder_t)); + CHECKMEM(aiholder); + + aiholder->address = strdup(address); + CHECKMEM(aiholder->address); + + if (firstAddress) + { + holder->addressinfo_head = holder->addressinfo_tail = aiholder; + } + else + { + holder->addressinfo_tail->next = aiholder; + holder->addressinfo_tail = aiholder; + } +} + +void initializeHostInfoDataFromFileEntry(apr_pool_t* tmp_pool, struct hostinfo_holder_t* holder, + char* primary_hostname, char* hostEntry, int hostType, char* smon_bin_dir, char* smon_log_dir) +{ + holder->hostname = apr_pstrdup(tmp_pool, primary_hostname); + CHECKMEM(holder->hostname); + + holder->smon_dir = apr_pstrdup(tmp_pool, smon_bin_dir); + CHECKMEM(holder->smon_dir); + + holder->datadir = apr_pstrdup(tmp_pool, smon_log_dir); + CHECKMEM(holder->datadir); + + switch(hostType) + { + case GPMON_HOSTTTYPE_HDW: + holder->is_hdw = 1; + break; + case GPMON_HOSTTTYPE_HDM: + 
holder->is_hdm = 1; + break; + case GPMON_HOSTTTYPE_ETL: + holder->is_etl = 1; + break; + case GPMON_HOSTTTYPE_HBW: + holder->is_hbw = 1; + break; + case GPMON_HOSTTTYPE_HDC: + holder->is_hdc = 1; + break; + } + + holder->address_count = 0; + int firstAddress = 1; + + while(*hostEntry) + { + char* location = strchr(hostEntry, ','); + if (location) + *location = 0; + + initializeHostInfoDataWithAddress(holder, hostEntry, firstAddress); + holder->address_count++; + if (!location) + return; // there were no commas so this is the last address in the hostEntry + *location = ','; + hostEntry = location+1; + firstAddress = 0; + } +} + +void process_line_in_hadoop_cluster_info(apr_pool_t* tmp_pool, apr_hash_t* htab, char* line, char* smon_bin_location, char* smon_log_location) +{ + if (!line) + { + gpmon_warningx(FLINE, 0, "Line in hadoop cluster info file is null, skipping"); + return; + } + + char* host; + char* category; + + char primary_hostname[64]; + + char* location = strchr(line, '#'); + if (location) + { + *location = 0; // remove comments from the line + } + + // we do these in reverse order so inserting null chars does not prevent finding other tokens + if (find_token_in_config_string(line, &category, "Categories")) + { + return; + } + location = strchr(category, ','); //remove the comma and extra categories + if (location) + { + *location = 0; + } + + if (find_token_in_config_string(line, &host, "Hostname")) + { + return; + } + TR1(("Found hadoop host %s\n",host )); + // look for the 3 hadoop host types + int monitored_device = 0; + int hostType = 0; + if (strcmp(category, "hdm") == 0) + { + monitored_device = 1; + hostType = GPMON_HOSTTTYPE_HDM; + } + + if (strcmp(category, "hdw") == 0) + { + monitored_device = 1; + hostType = GPMON_HOSTTTYPE_HDW; + } + + if (strcmp(category, "hdc") == 0) + { + monitored_device = 1; + hostType = GPMON_HOSTTTYPE_HDC; + } + // The below code is the same as the devices file parsing code + + // segment host, switch, etc ... 
we are only adding additional hosts required for performance monitoring + if (!monitored_device) + { + return; + } + + strncpy(primary_hostname, host, sizeof(primary_hostname)); + primary_hostname[sizeof(primary_hostname) - 1] = 0; + location = strchr(primary_hostname, ','); + if (location) + { + *location = 0; + } + + struct hostinfo_holder_t* hostinfo_holder = apr_hash_get(htab, primary_hostname, APR_HASH_KEY_STRING); + if (hostinfo_holder) + { + gpmon_warningx(FLINE, 0, "Host '%s' is duplicated in clusterinfo.txt", primary_hostname); + return; + } + + // OK Lets add this record at this point + hostinfo_holder = apr_pcalloc(tmp_pool, sizeof(struct hostinfo_holder_t)); + CHECKMEM(hostinfo_holder); + + apr_hash_set(htab, primary_hostname, APR_HASH_KEY_STRING, hostinfo_holder); + + initializeHostInfoDataFromFileEntry(tmp_pool, hostinfo_holder, primary_hostname, host, hostType, smon_bin_location, smon_log_location); +} + +//Return 1 if not a hadoop software only cluster and 0 it is a hadoop software only cluster +int get_hadoop_hosts_and_add_to_hosts(apr_pool_t* tmp_pool, apr_hash_t* htab, mmon_options_t* opt) +{ + if (!opt->smon_hadoop_swonly_binfile) + { + TR0(("hadoop_smon_path not specified in gpmmon config. not processing hadoop nodes\n")); + return 1; + } + + char* smon_log_dir; + char* hadoop_cluster_file; + if (opt->smon_hadoop_swonly_logdir) + { + smon_log_dir = opt->smon_hadoop_swonly_logdir; + } + else + { + smon_log_dir = (char*)PATH_TO_HADOOP_SMON_LOGS; + } + if (opt->smon_hadoop_swonly_clusterfile) + { + hadoop_cluster_file = opt->smon_hadoop_swonly_clusterfile; + } + else + { + hadoop_cluster_file = (char*)DEFAULT_PATH_TO_HADOOP_HOST_FILE; + } + + FILE* fd = fopen(hadoop_cluster_file, "r"); + if (!fd) + { + TR0(("not a hadoop software only cluster ... 
not reading %s\n", hadoop_cluster_file)); + return 1; + } + + char* line; + char buffer[1024]; + + // process the hostlines + while (NULL != fgets(buffer, sizeof(buffer), fd)) + { + line = gpmon_trim(buffer);// remove new line + process_line_in_hadoop_cluster_info(tmp_pool, htab, line, opt->smon_hadoop_swonly_binfile, smon_log_dir); + } + + fclose(fd); + return 0; +} + + +void gpdb_get_hostlist(int* hostcnt, host_t** host_table, apr_pool_t* global_pool, mmon_options_t* opt) +{ + apr_pool_t* pool; + PGconn* conn = 0; + PGresult* result = 0; + int rowcount, i; + unsigned int unique_hosts = 0; + apr_hash_t* htab; + struct hostinfo_holder_t* hostinfo_holder = NULL; + host_t* hosts = NULL; + int e; + + // 0 -- hostname, 1 -- address, 2 -- datadir, 3 -- is_master, + const char *QUERY = "SELECT distinct hostname, address, case when content < 0 then 1 else 0 end as is_master, MAX(datadir) as datadir FROM gp_segment_configuration " + "GROUP BY (hostname, address, is_master) order by hostname"; + + if (0 != (e = apr_pool_create_alloc(&pool, NULL))) + { + gpmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + } + + const char* errmsg = gpdb_exec(&conn, &result, QUERY); + + TR2(("%s\n", QUERY)); + + if (errmsg) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, QUERY); + } + else + { + // hash of hostnames to addresses + htab = apr_hash_make(pool); + + rowcount = PQntuples(result); + + for (i = 0; i < rowcount; i++) + { + char* curr_hostname = PQgetvalue(result, i, 0); + + hostinfo_holder = apr_hash_get(htab, curr_hostname, APR_HASH_KEY_STRING); + + if (!hostinfo_holder) + { + hostinfo_holder = apr_pcalloc(pool, sizeof(struct hostinfo_holder_t)); + CHECKMEM(hostinfo_holder); + + apr_hash_set(htab, curr_hostname, APR_HASH_KEY_STRING, hostinfo_holder); + + hostinfo_holder->hostname = curr_hostname; + hostinfo_holder->is_master = atoi(PQgetvalue(result, i, 2)); + hostinfo_holder->datadir = PQgetvalue(result, i, 3); + + // use permenant memory for address list 
-- stored for duration + + // populate 1st on list and save to head and tail + hostinfo_holder->addressinfo_head = hostinfo_holder->addressinfo_tail = calloc(1, sizeof(addressinfo_holder_t)); + CHECKMEM(hostinfo_holder->addressinfo_tail); + + // first is the hostname + hostinfo_holder->addressinfo_tail->address = strdup(hostinfo_holder->hostname); + CHECKMEM(hostinfo_holder->addressinfo_tail->address); + + + // add a 2nd to the list + hostinfo_holder->addressinfo_tail->next = calloc(1, sizeof(addressinfo_holder_t)); + CHECKMEM(hostinfo_holder->addressinfo_tail); + hostinfo_holder->addressinfo_tail = hostinfo_holder->addressinfo_tail->next; + + // second is address + hostinfo_holder->addressinfo_tail->address = strdup(PQgetvalue(result, i, 1)); + CHECKMEM(hostinfo_holder->addressinfo_tail->address); + + // one for hostname one for address + hostinfo_holder->address_count = 2; + } + else + { + // permenant memory for address list -- stored for duration + hostinfo_holder->addressinfo_tail->next = calloc(1, sizeof(addressinfo_holder_t)); + CHECKMEM(hostinfo_holder->addressinfo_tail); + + hostinfo_holder->addressinfo_tail = hostinfo_holder->addressinfo_tail->next; + + // permenant memory for address list -- stored for duration + hostinfo_holder->addressinfo_tail->address = strdup(PQgetvalue(result, i, 1)); + CHECKMEM(hostinfo_holder->addressinfo_tail->address); + + hostinfo_holder->address_count++; + } + + } + + TR0(("checking for SW Only hadoop hosts.\n")); + get_hadoop_hosts_and_add_to_hosts(pool, htab, opt); + + unique_hosts = apr_hash_count(htab); + + // allocate memory for host list (not freed ever) + hosts = calloc(unique_hosts, sizeof(host_t)); + + apr_hash_index_t* hi; + void* vptr; + int hostcounter = 0; + for (hi = apr_hash_first(0, htab); hi; hi = apr_hash_next(hi)) + { + // sanity check + if (hostcounter >= unique_hosts) + { + gpmon_fatalx(FLINE, 0, "host counter exceeds unique hosts"); + } + + apr_hash_this(hi, 0, 0, &vptr); + hostinfo_holder = vptr; + + 
hosts[hostcounter].hostname = strdup(hostinfo_holder->hostname); + hosts[hostcounter].data_dir = strdup(hostinfo_holder->datadir); + if (hostinfo_holder->smon_dir) + { + hosts[hostcounter].smon_bin_location = strdup(hostinfo_holder->smon_dir); + } + hosts[hostcounter].is_master = hostinfo_holder->is_master; + hosts[hostcounter].addressinfo_head = hostinfo_holder->addressinfo_head; + hosts[hostcounter].addressinfo_tail = hostinfo_holder->addressinfo_tail; + hosts[hostcounter].address_count = hostinfo_holder->address_count; + hosts[hostcounter].connection_hostname.current = hosts[hostcounter].addressinfo_head; + + if (hostinfo_holder->is_hdm) + hosts[hostcounter].is_hdm = 1; + + if (hostinfo_holder->is_hdw) + hosts[hostcounter].is_hdw = 1; + + if (hostinfo_holder->is_etl) + hosts[hostcounter].is_etl = 1; + + if (hostinfo_holder->is_hbw) + hosts[hostcounter].is_hbw = 1; + + if (hostinfo_holder->is_hdc) + hosts[hostcounter].is_hdc = 1; + + apr_thread_mutex_create(&hosts[hostcounter].mutex, APR_THREAD_MUTEX_UNNESTED, global_pool); // use the global pool so the mutexes last beyond this function + + hostcounter++; + } + + *hostcnt = hostcounter; + } + + apr_pool_destroy(pool); + PQclear(result); + PQfinish(conn); + + if (!hosts || *hostcnt < 1) + { + gpmon_fatalx(FLINE, 0, "no valid hosts found"); + } + + *host_table = hosts; +} + +void gpdb_get_master_data_dir(char** hostname, char** mstrdir, apr_pool_t* pool) +{ + PGconn* conn = 0; + PGresult* result = 0; + const char* QUERY = "select * from master_data_dir"; + char* dir = 0; + char* hname = 0; + int rowcount; + const char* errmsg = gpdb_exec(&conn, &result, QUERY); + if (errmsg) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, QUERY); + } + else + { + rowcount = PQntuples(result); + if (rowcount > 0) + { + hname = PQgetvalue(result, 0, 0); + dir = PQgetvalue(result, 0, 1); + } + + if (!hname || !dir) + { + gpmon_warning(FLINE, "unable to get master data directory"); + } + else + { + hname = 
apr_pstrdup(pool, gpmon_trim(hname)); + CHECKMEM(hname); + + dir = apr_pstrdup(pool, gpmon_trim(dir)); + CHECKMEM(dir); + } + } + + PQclear(result); + PQfinish(conn); + + *hostname = hname; + *mstrdir = dir; +} + +void gpdb_get_single_string_from_query(const char* QUERY, char** resultstring, apr_pool_t* pool) +{ + PGconn* conn = 0; + PGresult* result = 0; + char* tmpoutput = 0; + int rowcount; + const char* errmsg = gpdb_exec(&conn, &result, QUERY); + if (errmsg) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, QUERY); + } + else + { + rowcount = PQntuples(result); + if (rowcount == 1) + { + tmpoutput = PQgetvalue(result, 0, 0); + } + else if (rowcount > 1) + { + gpmon_warning(FLINE, "unexpected number of rows returned from query %s", QUERY); + } + + if (tmpoutput) + { + tmpoutput = apr_pstrdup(pool, gpmon_trim(tmpoutput)); + CHECKMEM(tmpoutput); + } + } + + PQclear(result); + PQfinish(conn); + + *resultstring = tmpoutput; +} + + +static void check_and_add_partition(PGconn* conn, const char* tbl, int begin_year, int begin_month, int end_year, int end_month) +{ + PGresult* result = 0; + const char* errmsg; + const int QRYBUFSIZ = 1024; + + char qry[QRYBUFSIZ]; + /* pg_partitions has been removed + * https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/install_guide-migrate-classic-partitioning.html + */ + const char* CHK_QRYFMT = "select dt from (select substring(pg_get_expr(relpartbound, oid) from 19 for 19) as dt from pg_class where relname like '%s_history_%%' and relispartition = true ) as TBL where TBL.dt = '%d-%02d-01 00:00:00';"; + const char* ADD_QRYFMT = "alter table %s_history add partition start ('%d-%02d-01 00:00:00'::timestamp without time zone) inclusive end ('%d-%02d-01 00:00:00'::timestamp without time zone) exclusive;"; + + snprintf(qry, QRYBUFSIZ, CHK_QRYFMT, tbl, begin_year, begin_month); + if (!gpdb_exec_search_for_at_least_one_row(qry, conn)) + { + // this partition does not exist, create it + + snprintf(qry, 
QRYBUFSIZ, ADD_QRYFMT, tbl, begin_year, begin_month, end_year, end_month); + TR0(("Add partition table '%s\n'", qry)); + errmsg = gpdb_exec_only(conn, &result, qry); + if (errmsg) + { + gpmon_warning(FLINE, "partition add response from server: %s\n", errmsg); + } + PQclear(result); + } +} + +// Drop old partitions if partition_age option is set. +static void drop_old_partitions(PGconn* conn, const char* tbl, mmon_options_t *opt) +{ + const int QRYBUFSIZ = 1024; + PGresult* result = NULL; + const char* errmsg; + char qry[QRYBUFSIZ]; + + const char* SELECT_QRYFMT = "SELECT relname, substring(pg_get_expr(relpartbound, oid) from 19 for 19)" + "as partitionrangestart FROM pg_class " + "WHERE relanme like '%s_history_%%' AND relispartition=true" + "ORDER BY partitionrangestart DESC OFFSET %d;"; + const char* DROP_QRYFMT = "ALTER TABLE %s_history DROP PARTITION IF EXISTS FOR (%s);"; + + int partition_age = opt->partition_age; + + if (partition_age <= 0) { + TR0(("partition_age turned off\n")); + return; + } + + // partition_age + 1 because we always add 2 partitions for the boundary case + snprintf(qry, QRYBUFSIZ, SELECT_QRYFMT, tbl, partition_age + 1); + + errmsg = gpdb_exec_only(conn, &result, qry); + if (errmsg) + { + gpmon_warning(FLINE, "drop partition: select query '%s' response from server: %s\n", qry, errmsg); + } + else + { + int rowcount = PQntuples(result); + int i = 0; + for (; i < rowcount; i++) + { + PGresult* dropResult = NULL; + char* partitiontablename = PQgetvalue(result, i, 0); + char* partitionrangestart = PQgetvalue(result, i, 1); + + // partitionrangestart comes out looking like `'2017-02-01 00:00:00'::timestamp(0) without time zone` + // or `'2010-01-01 00:00:00-08'::timestamp with time zone` + char *unwanted = strstr(partitionrangestart, "::" ); + + size_t substring_size = unwanted - partitionrangestart + 1; + char *substring = (char *) malloc(substring_size); + memcpy(substring, partitionrangestart, substring_size); + substring[substring_size - 1] 
= '\0'; + + snprintf(qry, QRYBUFSIZ, DROP_QRYFMT, tbl, substring); + + free(substring); + TR0(("Dropping partition table '%s'\n", partitiontablename)); + errmsg = gpdb_exec_only(conn, &dropResult, qry); + PQclear(dropResult); + if (errmsg) + { + gpmon_warning(FLINE, "drop partition: drop query '%s' response from server: %s\n", qry, errmsg); + break; + } + } + } + PQclear(result); +} + + +static apr_status_t check_partition(const char* tbl, apr_pool_t* pool, PGconn* conn, mmon_options_t *opt) +{ + struct tm tm; + time_t now; + + unsigned short year[3]; + unsigned char month[3]; + + TR0(("check partitions on %s_history\n", tbl)); + + if (!conn) + return APR_ENOMEM; + + now = time(NULL); + if (!localtime_r(&now, &tm)) + { + gpmon_warning(FLINE, "error in check_partition getting current time\n"); + return APR_EGENERAL; + } + + year[0] = 1900 + tm.tm_year; + month[0] = tm.tm_mon+1; + + if (year[0] < 1 || month[0] < 1 || year[0] > 2030 || month[0] > 12) + { + gpmon_warning(FLINE, "invalid current month/year in check_partition %u/%u\n", month[0], year[0]); + return APR_EGENERAL; + } + + if (month[0] < 11) + { + month[1] = month[0] + 1; + month[2] = month[0] + 2; + + year[1] = year[0]; + year[2] = year[0]; + } + else if (month[0] == 11) + { + month[1] = 12; + month[2] = 1; + + year[1] = year[0]; + year[2] = year[0] + 1; + } + else + { + month[1] = 1; + month[2] = 2; + + year[1] = year[0] + 1; + year[2] = year[0] + 1; + } + + check_and_add_partition(conn, tbl, year[0], month[0], year[1], month[1]); + check_and_add_partition(conn, tbl, year[1], month[1], year[2], month[2]); + + drop_old_partitions(conn, tbl, opt); + + TR0(("check partitions on %s_history done\n", tbl)); + return APR_SUCCESS; +} + +static apr_status_t harvest(const char* tbl, apr_pool_t* pool, PGconn* conN) +{ + PGconn* conn = 0; + PGresult* result = 0; + const int QRYBUFSIZ = 255; + char qrybuf[QRYBUFSIZ]; + const char* QRYFMT = "insert into %s_history select * from _%s_tail;"; + const char* errmsg; + 
	apr_status_t res = APR_SUCCESS;

	snprintf(qrybuf, QRYBUFSIZ, QRYFMT, tbl, tbl);

	errmsg = gpdb_exec(&conn, &result, qrybuf);
	if (errmsg)
	{
		// Non-zero status signals failure to the caller; details go to the log.
		res = 1;
		gpmon_warningx(FLINE, 0, "---- HARVEST %s FAILED ---- on query %s with error %s\n", tbl, qrybuf, errmsg);
	}
	else
	{
		TR1(("load completed OK: %s\n", tbl));
	}

	PQclear(result);
	PQfinish(conn);
	return res;
}

/**
 * This function removes the not null constraint from the segid column so that
 * we can set it to null when the segment aggregation flag is true
 */
apr_status_t remove_segid_constraint(void)
{
	PGconn* conn = 0;
	PGresult* result = 0;
	const char* ALTERSTR = "alter table iterators_history alter column segid drop not null;";
	const char* errmsg;
	apr_status_t res = APR_SUCCESS;

	// gpdb_exec opens its own connection; clean up with PQfinish below.
	errmsg = gpdb_exec(&conn, &result, ALTERSTR);
	if (errmsg)
	{
		res = 1;
		gpmon_warningx(FLINE, 0, "---- Alter FAILED ---- on command: %s with error %s\n", ALTERSTR, errmsg);
	}
	else
	{
		TR1(("remove_segid_constraint: alter completed OK\n"));
	}

	PQclear(result);
	PQfinish(conn);
	return res;
}

// Harvest a single table: insert its _<tbl>_tail data into <tbl>_history.
apr_status_t gpdb_harvest_one(const char* table)
{
	return harvest(table, NULL, NULL);
}



/* Open (creating if necessary) file fn and truncate it to zero length.
 * Returns the APR status; failures are also logged. */
apr_status_t truncate_file(char* fn, apr_pool_t* pool)
{
	apr_file_t *fp = NULL;
	apr_status_t status;

	status = apr_file_open(&fp, fn, APR_WRITE|APR_CREATE|APR_TRUNCATE, APR_UREAD|APR_UWRITE, pool);

	if (status == APR_SUCCESS)
	{
		status = apr_file_trunc(fp, 0);
		apr_file_close(fp);
	}

	if (status != APR_SUCCESS)
	{
		gpmon_warningx(FLINE, 0, "harvest process truncate file %s failed", fn);
	}
	else
	{
		TR1(("harvest truncated file %s: ok\n", fn));
	}

	return status;
}


/* rename tail to stage */
static apr_status_t rename_tail_files(const char* tbl, apr_pool_t* pool, PGconn* conn)
{
	char srcfn[PATH_MAX];
	char dstfn[PATH_MAX];

	apr_status_t status;

	/* make the file names */
	snprintf(srcfn, PATH_MAX, "%s%s_tail.dat", GPMON_DIR, tbl);
snprintf(dstfn, PATH_MAX, "%s%s_stage.dat", GPMON_DIR, tbl); + + status = apr_file_rename(srcfn, dstfn, pool); + if (status != APR_SUCCESS) + { + gpmon_warningx(FLINE, status, "harvest failed renaming %s to %s", srcfn, dstfn); + return status; + } + else + { + TR1(("harvest rename %s to %s success\n", srcfn, dstfn)); + } + + return status; +} + +/* append stage data to _tail file */ +static apr_status_t append_to_harvest(const char* tbl, apr_pool_t* pool, PGconn* conn) +{ + char srcfn[PATH_MAX]; + char dstfn[PATH_MAX]; + + apr_status_t status; + + /* make the file names */ + snprintf(srcfn, PATH_MAX, "%s%s_stage.dat", GPMON_DIR, tbl); + snprintf(dstfn, PATH_MAX, "%s_%s_tail.dat", GPMON_DIR, tbl); + + status = apr_file_append(srcfn, dstfn, APR_FILE_SOURCE_PERMS, pool); + if (status != APR_SUCCESS) + { + gpmon_warningx(FLINE, status, "harvest failed appending %s to %s", srcfn, dstfn); + } + else + { + TR1(("harvest append %s to %s: ok\n", srcfn, dstfn)); + } + + return status; +} + +typedef apr_status_t eachtablefunc(const char* tbl, apr_pool_t*, PGconn*); +typedef apr_status_t eachtablefuncwithopt(const char* tbl, apr_pool_t*, PGconn*, mmon_options_t*); + +apr_status_t call_for_each_table(eachtablefunc, apr_pool_t*, PGconn*); +apr_status_t call_for_each_table_with_opt(eachtablefuncwithopt, apr_pool_t*, PGconn*, mmon_options_t*); + + +char* all_tables[] = { "system", "queries", "database", "segment", "diskspace" }; + +apr_status_t call_for_each_table(eachtablefunc func, apr_pool_t* pool, PGconn* conn) +{ + apr_status_t status = APR_SUCCESS; + apr_status_t r; + int num_tables = sizeof(all_tables) / sizeof (char*); + int i; + + for (i = 0; i < num_tables; ++i) + { + r = func(all_tables[i], pool, conn); + if (r != APR_SUCCESS) + { + status = r; + } + } + + return status; +} + +apr_status_t call_for_each_table_with_opt(eachtablefuncwithopt func, apr_pool_t* pool, PGconn* conn, mmon_options_t *opt) +{ + apr_status_t status = APR_SUCCESS; + apr_status_t r; + int num_tables 
= sizeof(all_tables) / sizeof (char*); + int i; + + for (i = 0; i < num_tables; ++i) + { + r = func(all_tables[i], pool, conn, opt); + if (r != APR_SUCCESS) + { + status = r; + } + } + + return status; +} + +/* rename tail files to stage files */ +apr_status_t gpdb_rename_tail_files(apr_pool_t* pool) +{ + return call_for_each_table(rename_tail_files, pool, NULL); +} + +/* copy data from stage files to harvest files */ +apr_status_t gpdb_copy_stage_to_harvest_files(apr_pool_t* pool) +{ + return call_for_each_table(append_to_harvest, pool, NULL); +} + +/* truncate _tail files */ +apr_status_t empty_harvest_file(const char* tbl, apr_pool_t* pool, PGconn* conn) +{ + char fn[PATH_MAX]; + snprintf(fn, PATH_MAX, "%s_%s_tail.dat", GPMON_DIR, tbl); + return truncate_file(fn, pool); +} + +/* truncate tail files */ +apr_status_t truncate_tail_file(const char* tbl, apr_pool_t* pool, PGconn* conn) +{ + char fn[PATH_MAX]; + snprintf(fn, PATH_MAX, "%s%s_tail.dat", GPMON_DIR, tbl); + return truncate_file(fn, pool); +} + +/* truncate _tail files to clear data already loaded into the DB */ +apr_status_t gpdb_truncate_tail_files(apr_pool_t* pool) +{ + return call_for_each_table(truncate_tail_file, pool, NULL); +} + +/* truncate _tail files to clear data already loaded into the DB */ +apr_status_t gpdb_empty_harvest_files(apr_pool_t* pool) +{ + return call_for_each_table(empty_harvest_file, pool, NULL); +} + +/* insert _tail data into history table */ +apr_status_t gpdb_harvest(void) +{ + return call_for_each_table(harvest, NULL, NULL); +} + +//static bool gpdb_insert_alert_log() +//{ +// PGconn* conn = 0; +// PGresult* result = 0; +// const char* QRY = "insert into log_alert_history select * from log_alert_tail;"; +// const char* errmsg; +// errmsg = gpdb_exec(&conn, &result, QRY); +// +// bool success = true; +// if (errmsg) +// { +// gpmon_warningx( +// FLINE, 0, +// "---- ARCHIVING HISTORICAL ALERT DATA FAILED ---- on query %s with error %s\n", +// QRY, errmsg); +// success = 
false; +// } +// else +// { +// TR1(("load completed OK: alert_log\n")); +// } +// +// PQclear(result); +// PQfinish(conn); +// return success; +//} + +//static void gpdb_remove_success_files(apr_array_header_t *success_append_files, apr_pool_t *pool) +//{ +// void *file_slot = NULL; +// while ((file_slot = apr_array_pop(success_append_files))) +// { +// const char *file_name = (*(char**)file_slot); +// if (file_name) +// { +// if (apr_file_remove(file_name, pool) != APR_SUCCESS) +// { +// gpmon_warningx(FLINE, 0, "failed removing file:%s", file_name); +// } +// } +// } +//} + +//static int cmp_string(const void *left, const void *right) +//{ +// const char *op1 = *(const char**)left; +// const char *op2 = *(const char**)right; +// return strcmp(op1, op2); +//} + +// Find all files start with 'gpdb-alert' under GPMON_LOG directory, sort it and +// remove the latest one 'gpdb-alert-*.csv' as it is still used by GPDB. +//static void get_alert_log_tail_files(apr_array_header_t *tail_files, apr_pool_t *pool) +//{ +// apr_dir_t *dir; +// apr_status_t status = apr_dir_open(&dir, GPMON_LOG, pool); +// if (status != APR_SUCCESS) +// { +// gpmon_warningx(FLINE, status, "failed opening directory:%s", GPMON_LOG); +// return; +// } +// +// apr_finfo_t dirent; +// static const char gpdb_prefix[] = "gpdb-alert"; +// while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) +// { +// if (strncmp(dirent.name, gpdb_prefix, sizeof(gpdb_prefix) - 1) == 0) +// { +// void *file_slot = apr_array_push(tail_files); +// if (! file_slot) +// { +// gpmon_warningx(FLINE, 0, "failed getting alert tail log:%s due to out of memory", dirent.name); +// continue; +// } +// (*(const char**)file_slot) = apr_pstrcat(pool, GPMON_LOG, "/", dirent.name, NULL); +// } +// } +// +// // We only want to use qsort in stdlib.h, not the macro qsort in port.h. 
+// (qsort)(tail_files->elts, tail_files->nelts, tail_files->elt_size, cmp_string); +// (void)apr_array_pop(tail_files); +// apr_dir_close(dir); +//} + +/* gp_elog has been moved */ +/* +void gpdb_import_alert_log(apr_pool_t *pool) +{ + // Get alert log files to be imported. + apr_array_header_t* tail_files = apr_array_make(pool, 10, sizeof(char*)); + apr_array_header_t* success_append_files = apr_array_make(pool, 10, sizeof(char*)); + get_alert_log_tail_files(tail_files, pool); + + // Create or truncate stage file. + char *dst_file = apr_pstrcat(pool, GPMON_LOG, "/", GPMON_ALERT_LOG_STAGE, NULL); + apr_status_t status = truncate_file(dst_file, pool); + if (status != APR_SUCCESS) + { + gpmon_warningx(FLINE, 0, "failed truncating stage file:%s", dst_file); + return; + } + + // Append alert log tail file to stage file + void *tail_file = NULL; + while ((tail_file = apr_array_pop(tail_files))) + { + char *filename = *(char**)tail_file; + void *success_file_slot = apr_array_push(success_append_files); + if (!success_file_slot) + { + gpmon_warningx( + FLINE, 0, "failed appending file:%s to stage file:%s due to out of memory", + filename, dst_file); + continue; + } + (*(char**)success_file_slot) = NULL; + + status = apr_file_append(filename, dst_file, APR_FILE_SOURCE_PERMS, pool); + if (status != APR_SUCCESS) + { + gpmon_warningx(FLINE, status, "failed appending file:%s to stage file:%s", filename, dst_file); + continue; + } + else + { + (*(char**)success_file_slot) = filename; + TR1(("success appending file:%s to stage file:%s\n", filename, dst_file)); + } + } + + // Insert tail file to history table. 
+ if (!gpdb_insert_alert_log()) + { + // Failure might happen on malformed log entries + time_t now; + char timestr[20]; + char *bad_file; + + // Copy failed log into separate file for user attention + now = time(NULL); + strftime(timestr, 20, "%Y-%m-%d_%H%M%S", localtime(&now)); + bad_file = apr_pstrcat(pool, GPMON_LOG, "/", GPMON_ALERT_LOG_STAGE, "_broken_", timestr, NULL); + if (apr_file_copy(dst_file, bad_file, APR_FPROT_FILE_SOURCE_PERMS, pool) == APR_SUCCESS) + { + gpmon_warningx(FLINE, status, "Staging file with broken entries is archived to %s", bad_file); + } + else + { + gpmon_warningx(FLINE, status, "failed copying stage file:%s to broken file:%s", dst_file, bad_file); + } + } + + // Delete tail file regardless of load success, as keeping too many tail files + // might cause serious harm to the system + gpdb_remove_success_files(success_append_files, pool); + truncate_file(dst_file, pool); +} +*/ + + +/* insert _tail data into history table */ +apr_status_t gpdb_check_partitions(mmon_options_t *opt) +{ + apr_status_t result; + + PGconn *conn = NULL; + conn = PQconnectdb(GPDB_CONNECTION_STRING); + + if (PQstatus(conn) != CONNECTION_OK) { + gpmon_warning( + FLINE, + "error creating GPDB client connection to dynamically " + "check/create gpperfmon partitions: %s", + PQerrorMessage(conn)); + result = APR_EINVAL; + } else { + result = call_for_each_table_with_opt(check_partition, NULL, conn, opt); + + // make sure to run check_partition even if we just got a failure from the previous call + apr_status_t temp_result; + temp_result = check_partition("log_alert", NULL, conn, opt); + + // use the first error that occurred, if any + if (result == APR_SUCCESS) { + result = temp_result; + } + } + + // close connection + PQfinish(conn); + return result; +} + +static void convert_tuples_to_hash(PGresult *result, apr_hash_t *hash, apr_pool_t *pool) +{ + int rowcount = PQntuples(result); + int i = 0; + for (; i < rowcount; i++) + { + char* sessid = PQgetvalue(result, i, 
0); + char* query = PQgetvalue(result, i, 1); + + char *sessid_copy = apr_pstrdup(pool, sessid); + char *query_copy = apr_pstrdup(pool, query); + if (sessid_copy == NULL || query_copy == NULL) + { + gpmon_warning(FLINE, "Out of memory"); + continue; + } + apr_hash_set(hash, sessid_copy, APR_HASH_KEY_STRING, query_copy); + } +} + +apr_hash_t *get_active_queries(apr_pool_t *pool) +{ + PGresult *result = NULL; + apr_hash_t *active_query_tab = NULL; + + PGconn *conn = PQconnectdb(GPDB_CONNECTION_STRING); + if (PQstatus(conn) != CONNECTION_OK) + { + gpmon_warning( + FLINE, + "error creating GPDB client connection to dynamically " + "check/create gpperfmon partitions: %s", + PQerrorMessage(conn)); + PQfinish(conn); + return NULL; + } + + const char *qry= "SELECT sess_id, query FROM pg_stat_activity;"; + const char *errmsg = gpdb_exec_only(conn, &result, qry); + if (errmsg) + { + gpmon_warning(FLINE, "check query status failed : %s", errmsg); + } + else + { + active_query_tab = apr_hash_make(pool); + if (! active_query_tab) + { + gpmon_warning(FLINE, "Out of memory"); + } + else + { + convert_tuples_to_hash(result, active_query_tab, pool); + } + } + + PQclear(result); + PQfinish(conn); + + return active_query_tab; +} + +const char *iconv_encodings[] = { + NULL, // SQL_ASCII, not supported as server encoding. + "EUC-JP", + "EUC-CN", + "EUC-KR", + "EUC-TW", + "EUC-JISX0213", + "UTF8", + NULL, // MULE_INTERNAL, not supported in iconv. + "LATIN1", + "LATIN2", + "LATIN3", + "LATIN4", + "LATIN5", + "LATIN6", + "LATIN7", + "LATIN8", + "LATIN9", + "LATIN10", + "WINDOWS-1256", + "WINDOWS-1258", + NULL, // WIN866, not supported in iconv. + "WINDOWS-874", + "KOI8-R", + "WINDOWS-1251", + "WINDOWS-1252", + "ISO_8859-5", + "ISO_8859-6", + "ISO_8859-7", + "ISO_8859-8", + "WINDOWS-1250", + "WINDOWS-1253", + "WINDOWS-1254", + "WINDOWS-1255", + "WINDOWS-1257", + "KOI8-U", + "SJIS", + NULL, // BIG5, not supported in server encoding. + NULL, // GBK, not supported in server encoding. 
+ NULL, // UHC, not supported in server encoding. + NULL, // GB18030, not supported in server encoding. + "JOHAB", + NULL // SJIS, not supported in server encoding. +}; + +static const char* find_encoding(int encoding_num) +{ + // Because encodings here are in consistant with those + // in gpdb, we have assertion here. + ASSERT(encoding_num >= 0 && + encoding_num < (sizeof(iconv_encodings) / sizeof(char*))); + return iconv_encodings[encoding_num]; +} + +static bool get_encoding_from_result(PGresult *result, + char *encoding, + size_t encoding_len, + int *encoding_num) +{ + ASSERT(result); + ASSERT(encoding); + ASSERT(encoding_num); + if (PQntuples(result) > 0) + { + const char* encoding_str = PQgetvalue(result, 0, 0); + *encoding_num = atoi(encoding_str); + const char *encoding_item = find_encoding(*encoding_num); + if (encoding_item) + { + strncpy(encoding, encoding_item, encoding_len); + } + else + { + gpmon_warning(FLINE, "GPDB bad encoding: %d\n", *encoding_num); + return false; + } + } + else + { + TR0(("could not find owner for 'gpperfmon' database\n")); + return false; + } + return true; +} + +static bool gpdb_get_server_encoding(PGconn *conn, + char *encoding, + size_t encoding_len, + int *encoding_num) +{ + ASSERT(conn); + ASSERT(encoding); + ASSERT(encoding_num); + + PGresult *result = NULL; + const char *query = "SELECT encoding FROM pg_catalog.pg_database " + "d WHERE d.datname = 'gpperfmon'"; + const char* errmsg = gpdb_exec_only(conn, &result, query); + bool ret = true; + + if (errmsg != NULL) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, query); + ret = false; + } + else + { + ret = get_encoding_from_result(result, + encoding, + encoding_len, + encoding_num); + } + + PQclear(result); + return ret; +} + +static bool create_alert_table_with_script(PGconn *conn, const char *encoding) +{ + ASSERT(conn); + ASSERT(encoding); + const char query_pattern[] = "BEGIN;" + "DROP EXTERNAL TABLE IF EXISTS public.log_alert_tail;" + "CREATE 
EXTERNAL WEB TABLE public.log_alert_tail (LIKE " + "public.log_alert_history) EXECUTE 'gpperfmoncat.sh " + "gpperfmon/logs/alert_log_stage 2> /dev/null || true' " + "ON MASTER FORMAT 'csv' (DELIMITER e',' NULL e'' " + "ESCAPE e'\"' QUOTE e'\"') ENCODING '%s';" + "DROP EXTERNAL TABLE IF EXISTS public.log_alert_now;" + "CREATE EXTERNAL WEB TABLE public.log_alert_now " + "(LIKE public.log_alert_history) " + "EXECUTE 'gpperfmoncat.sh gpperfmon/logs/*.csv 2> /dev/null " + "|| true' ON MASTER FORMAT 'csv' (DELIMITER e',' NULL " + "e'' ESCAPE e'\"' QUOTE e'\"') ENCODING '%s'; COMMIT;"; + + char query[sizeof(query_pattern) + 100]; + snprintf(query, sizeof(query), query_pattern, encoding, encoding); + + return gpdb_exec_ddl(conn, query); +} + +static bool create_alert_table_without_script(PGconn *conn, const char *encoding) +{ + ASSERT(conn); + ASSERT(encoding); + const char query_pattern[] = "BEGIN;" + "DROP EXTERNAL TABLE IF EXISTS public.log_alert_tail;" + "CREATE EXTERNAL WEB TABLE public.log_alert_tail (LIKE " + "public.log_alert_history) EXECUTE 'iconv -f %s -t %s -c " + "gpperfmon/logs/alert_log_stage 2> /dev/null || true' ON MASTER FORMAT " + "'csv' (DELIMITER e',' NULL e'' ESCAPE e'\"' QUOTE e'\"') ENCODING '%s';" + "DROP EXTERNAL TABLE IF EXISTS public.log_alert_now;" + "CREATE EXTERNAL WEB TABLE public.log_alert_now (LIKE " + "public.log_alert_history) EXECUTE 'iconv -f %s -t %s -c " + "gpperfmon/logs/*.csv 2> /dev/null || true' ON MASTER FORMAT 'csv' " + "(DELIMITER e',' NULL e'' ESCAPE e'\"' QUOTE e'\"') ENCODING '%s'; COMMIT;"; + + char query[sizeof(query_pattern) + 100]; + snprintf(query, sizeof(query), query_pattern, encoding, encoding, + encoding, encoding, encoding, encoding); + return gpdb_exec_ddl(conn, query); +} + +static bool recreate_alert_tables_if_needed(PGconn *conn, const char *owner) +{ + ASSERT(conn); + + const int max_encoding_length = 20; + char encoding[max_encoding_length]; + int encoding_num; + bool success_get_encoding = 
gpdb_get_server_encoding(conn, + encoding, + sizeof(encoding), + &encoding_num); + if (!success_get_encoding) + { + gpmon_warning(FLINE, "GPDB failed to get server encoding.\n"); + return false; + } + + bool script_exist = (system("which gpperfmoncat.sh > /dev/null 2>&1") == 0); + bool should_recreate = gpdb_should_recreate_log_alert( + conn, + "log_alert_tail", + encoding, + encoding_num, + script_exist); + if (should_recreate) + { + return (script_exist ? + create_alert_table_with_script(conn, encoding) : + create_alert_table_without_script(conn, encoding)); + } + + return true; +} + +static bool gpdb_get_gpperfmon_owner(PGconn *conn, char *owner, size_t owner_length) +{ + ASSERT(conn); + ASSERT(owner); + + PGresult *result = NULL; + const char *query = "select pg_catalog.pg_get_userbyid(d.datdba) as " + "owner from pg_catalog.pg_database d where " + "d.datname = 'gpperfmon'"; + const char *errmsg = gpdb_exec_only(conn, &result, query); + bool ret = true; + + if (errmsg != NULL) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, query); + ret = false; + } + else + { + if (PQntuples(result) > 0) + { + const char* owner_field = PQgetvalue(result, 0, 0); + strncpy(owner, owner_field, owner_length); + ret = true; + } + else + { + TR0(("could not find owner for 'gpperfmon' database\n")); + ret = false; + } + } + PQclear(result); + return ret; +} + +static void gpdb_change_alert_table_owner(PGconn *conn, const char *owner) +{ + ASSERT(conn); + ASSERT(owner); + + // change owner from gpmon, otherwise, gpperfmon_install + // might quit with error when execute 'drop role gpmon if exists' + const char* query_pattern = "ALTER TABLE public.log_alert_history owner to %s;" + "ALTER EXTERNAL TABLE public.log_alert_tail " + "owner to %s;ALTER EXTERNAL TABLE public.log_alert_now" + " owner to %s;"; + const int querybufsize = 512; + char query[querybufsize]; + snprintf(query, querybufsize, query_pattern, owner, owner, owner); + TR0(("change owner to %s\n", owner)); 
+	gpdb_exec_ddl(conn, query);
+}
+
+/*
+ * Upgrade: alter the distribution key of log_alert_history from
+ * logsegment to logtime.  A no-op when the table already uses logtime
+ * (or any key other than the legacy logsegment).
+ */
+void upgrade_log_alert_table_distributed_key(PGconn* conn)
+{
+	if (conn == NULL)
+	{
+		TR0(("Can't upgrade log_alert_history: conn is NULL\n"));
+		return;
+	}
+
+	// Look up the current distribution key of log_alert_history.
+	const char* qry = "SELECT d.nspname||'.'||a.relname as tablename, b.attname as distributed_key\
+		FROM pg_class a\
+		INNER JOIN pg_attribute b on a.oid=b.attrelid\
+		INNER JOIN gp_distribution_policy c on a.oid = c.localoid\
+		INNER JOIN pg_namespace d on a.relnamespace = d.oid\
+		WHERE a.relkind = 'r' AND b.attnum = any(c.distkey) AND a.relname = 'log_alert_history'";
+
+	PGresult* result = NULL;
+	const char* errmsg = gpdb_exec_only(conn, &result, qry);
+
+	if (errmsg != NULL)
+	{
+		gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, qry);
+	}
+	else
+	{
+		if (PQntuples(result) > 0)
+		{
+			// check if current distributed key is logsegment
+			const char* current_distributed_key = PQgetvalue(result, 0, 1);
+			if (current_distributed_key == NULL)
+			{
+				TR0(("could not find distributed key of log_alert_history\n"));
+				PQclear(result);
+				return;
+			}
+			if (strcmp(current_distributed_key, "logsegment") == 0)
+			{
+				// The DDL below moves the table *to* logtime; the previous
+				// log message wrongly named logsegment as the new key.
+				TR0(("[INFO] log_alert_history: Upgrading log_alert_history table to use logtime as distributed key\n"));
+				qry = "alter table public.log_alert_history set distributed by (logtime);";
+				gpdb_exec_ddl(conn, qry);
+			}
+		}
+	}
+
+	PQclear(result);
+	return;
+}
+
+
+
+// to mitigate upgrade hassle.
+void create_log_alert_table() +{ + PGconn *conn = PQconnectdb(GPDB_CONNECTION_STRING); + if (PQstatus(conn) != CONNECTION_OK) + { + gpmon_warning(FLINE, + "error creating gpdb client connection to dynamically " + "check/create gpperfmon partitions: %s", + PQerrorMessage(conn)); + PQfinish(conn); + return; + } + + const char *qry= "SELECT tablename FROM pg_tables " + "WHERE tablename = 'log_alert_history' " + "AND schemaname = 'public' ;"; + + const bool has_history_table = gpdb_exec_search_for_at_least_one_row(qry, conn); + + char owner[MAX_OWNER_LENGTH] = {}; + bool success_get_owner = gpdb_get_gpperfmon_owner(conn, owner, sizeof(owner)); + + // log_alert_history: create table if not exist or alter it to use correct + // distribution key. + if (!has_history_table) + { + qry = "BEGIN; CREATE TABLE public.log_alert_history (LIKE " + "gp_toolkit.__gp_log_master_ext) DISTRIBUTED BY (logtime) " + "PARTITION BY range (logtime)(START (date '2010-01-01') " + "END (date '2010-02-01') EVERY (interval '1 month')); COMMIT;"; + + TR0(("sounds like you have just upgraded your database, creating" + " newer tables\n")); + + gpdb_exec_ddl(conn, qry); + } + else + { + /* + * Upgrade: alter distributed key of log_alert_history from logsegment to logtime + */ + upgrade_log_alert_table_distributed_key(conn); + } + + // log_alert_now/log_alert_tail: change to use 'gpperfmoncat.sh' from 'iconv/cat' to handle + // encoding issue. 
+ if (recreate_alert_tables_if_needed(conn, owner)) + { + if (success_get_owner) + { + gpdb_change_alert_table_owner(conn, owner); + } + } + else + { + TR0(("recreate alert_tables failed\n")); + } + + PQfinish(conn); + return; +} diff --git a/contrib/perfmon/src/gpmmon/gpmondb.h b/contrib/perfmon/src/gpmmon/gpmondb.h new file mode 100644 index 00000000000..ffd732a8fef --- /dev/null +++ b/contrib/perfmon/src/gpmmon/gpmondb.h @@ -0,0 +1,97 @@ +#ifndef GPMONDB_H +#define GPMONDB_H + +#include "apr_general.h" +#include "apr_md5.h" +#include "apr_hash.h" +#include "gpmonlib.h" + +/** + * Validate the the gpperfmon database is correct and + * gpmon user has correct access. + */ +APR_DECLARE(int) gpdb_validate_gpperfmon(void); + +/** + * Check if gpperfmon database exists. + */ +APR_DECLARE(Oid) gpdb_gpperfmon_dbid(void); + +/** + * Check if gpmon user has access to ext tables + */ +APR_DECLARE(int) gpdb_validate_ext_table_access(void); + +/** + * Check if perfmon is enabled + */ +APR_DECLARE(int) gpdb_gpperfmon_enabled(void); + +/** + * Retrieve a list of all hosts in the GPDB. + * @param hostcnt return # elements in hostvec + * @param hostvec return array of hostnames. + * @param pool where to allocate hostvec and its contents. + */ +APR_DECLARE(void) gpdb_get_hostlist(int* hostcnt, host_t** host_table, apr_pool_t* global_pool, mmon_options_t* opt); + +/** + * Get the master data directory in the GPDB. + * @param mstrdir return the master data directory + * @param hostname return the master hostname + * @param pool where to allocate hostname and mstrdir + */ +APR_DECLARE(void) gpdb_get_master_data_dir(char** hostname, char** mstrdir, apr_pool_t* pool); + +/** + * Find out all read only alert tail files, which don't include the one being written by + * gpdb currently. Merge them into stage file, and load into log_alert_history table, + * and remove them when successful. 
+ */ +APR_DECLARE(void) gpdb_import_alert_log(apr_pool_t* pool); + +/** + * check if new historical partitions are required and create them + */ +APR_DECLARE(apr_status_t) gpdb_check_partitions(mmon_options_t *opt); + +/** + * insert _tail data into history table + */ +APR_DECLARE(apr_status_t) gpdb_harvest(void); + +/** + * truncate _tail files to clear data already loaded into the DB + */ +APR_DECLARE( apr_status_t) gpdb_empty_harvest_files(apr_pool_t* pool); + +/** + * rename tail to stage files to allow continuous reading and allow new data to go into tail files + */ +APR_DECLARE (apr_status_t) gpdb_rename_tail_files(apr_pool_t* pool); + +/** + * add the data from the stage files into the harvest (_tail) files for loading into history + */ +APR_DECLARE (apr_status_t) gpdb_copy_stage_to_harvest_files(apr_pool_t* pool); + +/** + * restart with empty tail files + */ +APR_DECLARE (apr_status_t) gpdb_truncate_tail_files(apr_pool_t* pool); + +APR_DECLARE (apr_status_t) gpdb_harvest_one(const char* table); + +APR_DECLARE (apr_status_t) remove_segid_constraint(void); + +APR_DECLARE (apr_hash_t *) get_active_queries(apr_pool_t* pool); + +APR_DECLARE (void) create_log_alert_table(void); + +int find_token_in_config_string(char*, char**, const char*); +void process_line_in_hadoop_cluster_info(apr_pool_t*, apr_hash_t*, char*, char*, char*); +int get_hadoop_hosts_and_add_to_hosts(apr_pool_t*, apr_hash_t*, mmon_options_t*); +apr_status_t truncate_file(char*, apr_pool_t*); + +#endif /* GPMONDB_H */ + diff --git a/contrib/perfmon/src/gpmon/Makefile b/contrib/perfmon/src/gpmon/Makefile new file mode 100644 index 00000000000..777183f07de --- /dev/null +++ b/contrib/perfmon/src/gpmon/Makefile @@ -0,0 +1,15 @@ +top_builddir = ../../../../ + +MODULE_big = gpmon +OBJS = gpmon.o +PG_CPPFLAGS = -I../include + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/perfmon/src/gpmmon +include 
$(top_builddir)/src/Makefile.global +include $(top_builddir)/contrib/contrib-global.mk +endif diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c new file mode 100644 index 00000000000..9f0ffcc8388 --- /dev/null +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -0,0 +1,512 @@ +#include "postgres.h" +#include "c.h" +#include +#include +#include +#include +#include +#include +#include +#ifdef WIN32 +#include +#endif +#include "libpq/pqsignal.h" +#include "gpmon.h" + +#include "utils/guc.h" +#include "utils/memutils.h" + +#include "access/xact.h" +#include "cdb/cdbtm.h" +#include "cdb/cdbvars.h" +#include "executor/executor.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "utils/metrics_utils.h" +#include "utils/metrics_utils.h" +#include "utils/snapmgr.h" + +PG_MODULE_MAGIC; +static int32 init_tmid = -1;; + +void _PG_init(void); +void _PG_fini(void); + +/* Extern stuff */ +extern char *get_database_name(Oid dbid); + +static void gpmon_record_kv_with_file(const char* key, + const char* value, + bool extraNewLine, + FILE* fp); +static void gpmon_record_update(int32 tmid, int32 ssid, + int32 ccnt, int32 status); +static const char* gpmon_null_subst(const char* input); + +/* gpmon hooks */ +static query_info_collect_hook_type prev_query_info_collect_hook = NULL; + +static void gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc); + +static gpmon_packet_t* gpmon_qlog_packet_init(); +static void init_gpmon_hooks(void); + +struct { + int gxsock; + pid_t pid; + struct sockaddr_in gxaddr; +} gpmon = {0}; + +int64 gpmon_tick = 0; + +void gpmon_sig_handler(int sig); + +void gpmon_sig_handler(int sig) +{ + gpmon_tick++; +} + + +void gpmon_init(void) +{ +// struct itimerval tv; +#ifndef WIN32 + pqsigfunc sfunc; +#endif + pid_t pid = getpid(); + int sock; + + if (pid == gpmon.pid) + return; +#ifndef WIN32 + sfunc = pqsignal(SIGVTALRM, gpmon_sig_handler); + if (sfunc == SIG_ERR) { + elog(WARNING, "[perfmon]: unable to set 
signal handler for SIGVTALRM (%m)"); + } + else if (sfunc == gpmon_sig_handler) { + close(gpmon.gxsock); + gpmon.gxsock = -1; + } + else { + Assert(sfunc == 0); + } +#endif + +// /*TODO: what exactly perfmon_send_interval does? */ +// tv.it_interval.tv_sec = perfmon_send_interval; +// //tv.it_interval.tv_sec = 5; +// tv.it_interval.tv_usec = 0; +// tv.it_value = tv.it_interval; +//#ifndef WIN32 +// if (-1 == setitimer(ITIMER_VIRTUAL, &tv, 0)) { +// elog(WARNING, "[perfmon]: unable to start timer (%m)"); +// } +//#endif +// + sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock == -1) { + elog(WARNING, "[perfmon]: cannot create socket (%m)"); + } +#ifndef WIN32 + if (fcntl(sock, F_SETFL, O_NONBLOCK) == -1) { + elog(WARNING, "[perfmon] fcntl(F_SETFL, O_NONBLOCK) failed"); + } + if (fcntl(sock, F_SETFD, 1) == -1) { + elog(WARNING, "[perfmon] fcntl(F_SETFD) failed"); + } +#endif + gpmon.gxsock = sock; + memset(&gpmon.gxaddr, 0, sizeof(gpmon.gxaddr)); + gpmon.gxaddr.sin_family = AF_INET; + gpmon.gxaddr.sin_addr.s_addr = inet_addr("127.0.0.1"); + gpmon.gxaddr.sin_port = htons(perfmon_port); + gpmon.pid = pid; +} + +static void gpmon_record_kv_with_file(const char* key, + const char* value, + bool extraNewLine, + FILE* fp) +{ + int len = strlen(value); + + fprintf(fp, "%d %s\n", len, key); + fwrite(value, 1, len, fp); + fprintf(fp, "\n"); + + if (extraNewLine) + { + fprintf(fp, "\n"); + } +} + +void gpmon_record_update(int32 tmid, int32 ssid, int32 ccnt, + int32 status) +{ + char fname[GPMON_DIR_MAX_PATH]; + FILE *fp; + + snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, tmid, ssid, ccnt); + + fp = fopen(fname, "r+"); + + if (!fp) + return; + + if (0 == fseek(fp, -1, SEEK_END)) + { + fprintf(fp, "%d", status); + } + fclose(fp); +} + +void gpmon_gettmid(int32* tmid) +{ + Assert(init_tmid > -1); + *tmid = init_tmid; +} + + +void gpmon_send(gpmon_packet_t* p) +{ + if (p->magic != GPMON_MAGIC) { + elog(WARNING, "[perfmon] - bad magic %x", p->magic); + return; + } 
+ + + if (p->pkttype == GPMON_PKTTYPE_QEXEC) { + elog(DEBUG1, + "[perfmon] Perfmon Executor Packet: (tmid, ssid, ccnt, segid, pid, nid, status) = " + "(%d, %d, %d, %d, %d, %d, %d)", + p->u.qexec.key.tmid, p->u.qexec.key.ssid, p->u.qexec.key.ccnt, + p->u.qexec.key.hash_key.segid, p->u.qexec.key.hash_key.pid, p->u.qexec.key.hash_key.nid, + p->u.qexec.status); + } + + if (gpmon.gxsock > 0) { + int n = sizeof(*p); + if (n != sendto(gpmon.gxsock, (const char *)p, n, 0, + (struct sockaddr*) &gpmon.gxaddr, + sizeof(gpmon.gxaddr))) { + elog(LOG, "[perfmon]: cannot send (%m socket %d)", gpmon.gxsock); + } + } +} + +#define GPMON_QLOG_PACKET_ASSERTS(gpmonPacket) \ + Assert(perfmon_enabled && Gp_role == GP_ROLE_DISPATCH); \ + Assert(gpmonPacket); \ + Assert(gpmonPacket->magic == GPMON_MAGIC); \ + Assert(gpmonPacket->version == GPMON_PACKET_VERSION); \ + Assert(gpmonPacket->pkttype == GPMON_PKTTYPE_QLOG) + +/** + * Create and init a qlog packet + * + * It is called by gpmon_query_info_collect_hook each time + * gpsmon and gpmmon will merge the packets with the same + * key together in 'update_qlog' + */ +static gpmon_packet_t* +gpmon_qlog_packet_init() +{ + const char *username = NULL; + gpmon_packet_t *gpmonPacket = NULL; + gpmonPacket = (gpmon_packet_t *) palloc(sizeof(gpmon_packet_t)); + memset(gpmonPacket, 0, sizeof(gpmon_packet_t)); + + Assert(perfmon_enabled && Gp_role == GP_ROLE_DISPATCH); + Assert(gpmonPacket); + + gpmonPacket->magic = GPMON_MAGIC; + gpmonPacket->version = GPMON_PACKET_VERSION; + gpmonPacket->pkttype = GPMON_PKTTYPE_QLOG; + gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_SILENT; + + gpmon_gettmid(&gpmonPacket->u.qlog.key.tmid); + gpmonPacket->u.qlog.key.ssid = gp_session_id; + gpmonPacket->u.qlog.pid = MyProcPid; + + + username = GetConfigOption("session_authorization", false, false); /* does not have to be freed */ + /* User Id. 
We use session authorization_string (so to make sense with session id) */ + snprintf(gpmonPacket->u.qlog.user, sizeof(gpmonPacket->u.qlog.user), "%s", + username ? username : ""); + gpmonPacket->u.qlog.dbid = MyDatabaseId; + + /* Fix up command count */ + gpmonPacket->u.qlog.key.ccnt = gp_command_count; + return gpmonPacket; +} + +/** + * Call this method when query is submitted. + */ +void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket) +{ + struct timeval tv; + + + GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); + + gettimeofday(&tv, 0); + + gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_SUBMIT; + gpmonPacket->u.qlog.tsubmit = tv.tv_sec; + + //gpmon_record_update(gpmonPacket->u.qlog.key.tmid, + // gpmonPacket->u.qlog.key.ssid, + // gpmonPacket->u.qlog.key.ccnt, + // gpmonPacket->u.qlog.status); + // + gpmon_send(gpmonPacket); +} + +/** + * Wrapper function that returns string if not null. Returns GPMON_UNKNOWN if it is null. + */ +static const char* gpmon_null_subst(const char* input) +{ + return input ? input : GPMON_UNKNOWN; +} + + +/** + * Call this method to let gpmon know the query text, application name, resource queue name and priority + * at submit time. It writes 4 key value pairs using keys: qtext, appname, resqname and priority using + * the format as described as below: + * This method adds a key-value entry to the gpmon text file. The format it uses is: + * \n + * \n + * Boolean value extraByte indicates whether an additional newline is desired. This is + * necessary because gpmon overwrites the last byte to indicate status. 
+ */ + +void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, + const char *queryText, + const char *appName, + const char *resqName, + const char *resqPriority) +{ + GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); + char fname[GPMON_DIR_MAX_PATH]; + FILE* fp; + + queryText = gpmon_null_subst(queryText); + appName = gpmon_null_subst(appName); + resqName = gpmon_null_subst(resqName); + resqPriority = gpmon_null_subst(resqPriority); + + Assert(queryText); + Assert(appName); + Assert(resqName); + Assert(resqPriority); + + + snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, + gpmonPacket->u.qlog.key.tmid, + gpmonPacket->u.qlog.key.ssid, + gpmonPacket->u.qlog.key.ccnt); + + fp = fopen(fname, "a"); + if (!fp) + return; + gpmon_record_kv_with_file("qtext", queryText, false, fp); + + gpmon_record_kv_with_file("appname", appName, false, fp); + + gpmon_record_kv_with_file("resqname", resqName, false, fp); + + gpmon_record_kv_with_file("priority", resqPriority, true, fp); + + fprintf(fp, "%d", GPMON_QLOG_STATUS_SUBMIT); + fclose(fp); + +} + +/** + * Call this method when query starts executing. + */ +void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket) +{ + struct timeval tv; + + GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); + + gettimeofday(&tv, 0); + + gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_START; + gpmonPacket->u.qlog.tstart = tv.tv_sec; + + gpmon_record_update(gpmonPacket->u.qlog.key.tmid, + gpmonPacket->u.qlog.key.ssid, + gpmonPacket->u.qlog.key.ccnt, + gpmonPacket->u.qlog.status); + + gpmon_send(gpmonPacket); +} + +/** + * Call this method when query finishes executing. 
+ */ +void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket) +{ + struct timeval tv; + + GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); + gettimeofday(&tv, 0); + + gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_DONE; + gpmonPacket->u.qlog.tfin = tv.tv_sec; + + gpmon_record_update(gpmonPacket->u.qlog.key.tmid, + gpmonPacket->u.qlog.key.ssid, + gpmonPacket->u.qlog.key.ccnt, + gpmonPacket->u.qlog.status); + + gpmon_send(gpmonPacket); +} + +/** + * Call this method when query errored out. + */ +void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket) +{ + struct timeval tv; + + GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); + + gettimeofday(&tv, 0); + + gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_ERROR; + gpmonPacket->u.qlog.tfin = tv.tv_sec; + + gpmon_record_update(gpmonPacket->u.qlog.key.tmid, + gpmonPacket->u.qlog.key.ssid, + gpmonPacket->u.qlog.key.ccnt, + gpmonPacket->u.qlog.status); + + gpmon_send(gpmonPacket); +} + +/* + * gpmon_qlog_query_canceling + * Record that the query is being canceled. + */ +void +gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket) +{ + GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); + gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_CANCELING; + + gpmon_record_update(gpmonPacket->u.qlog.key.tmid, + gpmonPacket->u.qlog.key.ssid, + gpmonPacket->u.qlog.key.ccnt, + gpmonPacket->u.qlog.status); + + gpmon_send(gpmonPacket); +} + +static void +gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) +{ + char *query_text; + QueryDesc *qd = (QueryDesc *)queryDesc; + if (perfmon_enabled + && Gp_role == GP_ROLE_DISPATCH && qd != NULL) + { + gpmon_packet_t *gpmonPacket = NULL; + PG_TRY(); + { + gpmonPacket = gpmon_qlog_packet_init(); + switch (status) + { + case METRICS_QUERY_START: + gpmon_qlog_query_start(gpmonPacket); + break; + case METRICS_QUERY_SUBMIT: + /* convert to UTF8 which is encoding for gpperfmon database */ + query_text = (char *)qd->sourceText; + /** + * When client encoding and server encoding are different, do apply the 
conversion. + */ + if (GetDatabaseEncoding() != pg_get_client_encoding()) + { + query_text = (char *)pg_do_encoding_conversion((unsigned char*)qd->sourceText, + strlen(qd->sourceText), GetDatabaseEncoding(), PG_UTF8); + } + gpmon_qlog_query_text(gpmonPacket, + query_text, + application_name, + NULL, + NULL); + gpmon_qlog_query_submit(gpmonPacket); + break; + case METRICS_QUERY_DONE: + gpmon_qlog_query_end(gpmonPacket); + break; + /* TODO: no GPMON_QLOG_STATUS for METRICS_QUERY_CANCELED */ + case METRICS_QUERY_CANCELING: + gpmon_qlog_query_canceling(gpmonPacket); + break; + case METRICS_QUERY_ERROR: + case METRICS_QUERY_CANCELED: + gpmon_qlog_query_error(gpmonPacket); + break; + default: + break; + } + pfree(gpmonPacket); + } + PG_CATCH(); + { + EmitErrorReport(); + /* swallow any error in this hook */ + FlushErrorState(); + if (gpmonPacket != NULL) + pfree(gpmonPacket); + } + PG_END_TRY(); + } + if (prev_query_info_collect_hook) + (*prev_query_info_collect_hook) (status, qd); +} + +static void +init_gpmon_hooks(void) +{ + prev_query_info_collect_hook = query_info_collect_hook; + query_info_collect_hook = gpmon_query_info_collect_hook; +} + +void +_PG_init(void) +{ + time_t t; + if (!process_shared_preload_libraries_in_progress) + { + ereport(ERROR, (errmsg("gpmon not in shared_preload_libraries"))); + } + else + { + if (!perfmon_enabled) + return; + /* add version info */ + ereport(LOG, (errmsg("booting gpmon"))); + } + init_gpmon_hooks(); + + t = time(NULL); + + if (t == (time_t) -1) + { + elog(PANIC, "[perfmon] cannot generate global transaction id"); + } + init_tmid = t; + gpmon_init(); +} + +void +_PG_fini(void) +{} diff --git a/contrib/perfmon/src/gpsmon/Makefile b/contrib/perfmon/src/gpsmon/Makefile new file mode 100644 index 00000000000..8484e3d3eb0 --- /dev/null +++ b/contrib/perfmon/src/gpsmon/Makefile @@ -0,0 +1,37 @@ +top_builddir = ../../../../ + +PG_CPPFLAGS = -I$(libpq_srcdir) -I../include -I. 
-Wno-error=vla -Wno-vla
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/perfmon/src/gpsmon
+include $(top_builddir)/src/Makefile.global
+include $(top_builddir)/contrib/contrib-global.mk
+endif
+
+# := so the shell command runs once at parse time, not on every expansion.
+PLATFORM := $(shell uname -s)
+# `uname -s` reports "Darwin" (capitalized) on macOS; matching lower-case
+# "darwin" never fired, so the frameworks were never linked.
+ifeq ($(PLATFORM), Darwin)
+GPSMON_LIBS=-framework CoreServices -framework IOKit
+SIGTEST_LIBS=-framework CoreServices -framework IOKit
+endif
+
+LDLIBS = $(LDFLAGS) $(LIBS)
+
+# gpsmon is a real file target, so it must not be .PHONY (that forced a
+# relink on every make run); the command-style targets are phony instead.
+.PHONY: all clean distclean install
+GPSMON_OBJS=gpsmon.o ../common/gpmonlib.o
+GPSMON_LIBS+= -levent -lapr-1 -laprutil-1 -lm -lsigar
+SIGTEST_LIBS+= -lsigar
+gpsmon: $(GPSMON_OBJS)
+	$(CC) -o $@ $(CFLAGS) $(GPSMON_OBJS) $(LDLIBS) $(GPSMON_LIBS)
+all: gpsmon
+
+sigartest: $(SIGTEST_OBJS)
+	$(CC) -o $@ $(CFLAGS) $(SIGTEST_OBJS) $(LDLIBS) $(GPSMON_LIBS)
+
+clean distclean:
+	rm -f *.o gpsmon
+
+install: gpsmon
+	$(INSTALL_PROGRAM) gpsmon '$(DESTDIR)$(bindir)'
diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c
new file mode 100644
index 00000000000..1714cc531a4
--- /dev/null
+++ b/contrib/perfmon/src/gpsmon/gpsmon.c
@@ -0,0 +1,1772 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "apr_getopt.h"
+#include "apr_env.h"
+#include "apr_hash.h"
+#include "apr_strings.h"
+#include "apr_pools.h"
+#include "gpmonlib.h"
+#include "gpmon.h"
+#include "apr_thread_proc.h"
+#include "event.h"
+#include "sigar.h"
+#include
+
+#define FMT64 APR_INT64_T_FMT
+
+void update_log_filename(void);
+void gx_main(int, apr_int64_t);
+
+/* Temporary global memory to store the qexec line for a send*/
+char qexec_smon_temp_line[QEXEC_MAX_ROW_BUF_SIZE];
+
+
+// no locking of log file in smon because it is single threaded
+apr_thread_mutex_t *logfile_mutex = NULL;
+
+/* Command-line options for gpsmon, filled in by the option parser. */
+static struct
+{
+	const char* pname;
+	int v;
+	int V;
+	int D;
+	const char* arg_port;
+	const char* log_dir;
+	apr_uint64_t max_log_size;
+	// The timeout in seconds for smon to 
restart if no requests + // come during that period. + apr_uint64_t terminate_timeout; +} opt = { 0 }; + +int verbose = 0; /* == opt.v */ +int very_verbose = 0; /* == opt.V */ +int number_cpu_cores = 1; +float cpu_cores_utilization_multiplier = 1.0; /* multipy CPU % from libsigar by this factor to get the CPU % per machine */ + +typedef struct pidrec_t pidrec_t; +struct pidrec_t +{ + apr_uint64_t updated_tick; /* when this pidrec was updated */ + apr_uint32_t pid; + char* pname; + char* cwd; + gpmon_proc_metrics_t p_metrics; + apr_uint64_t cpu_elapsed; + gpmon_qlogkey_t query_key; +}; + +typedef struct gx_t gx_t; +struct gx_t +{ + int port; + apr_int64_t signature; + apr_uint64_t tick; + time_t now; + + sigar_t* sigar; + + /*fslist Does not incude remote filesystems and used for reporting metrics, not space avaliable & free.*/ + const char** fslist; + const char** devlist; + const char** netlist; + + /*This fs list includes remote filesystems and is used for reporting space avaliable & free. 
*/ + const char** allfslist; + + SOCKET listen_sock; + struct event listen_event; + + SOCKET tcp_sock; + struct event tcp_event; + + SOCKET udp_sock; + struct event udp_event; + + apr_pool_t* pool; + int qd_pid; + const char* hostname; /* my hostname */ + + /* hash tables */ + apr_hash_t* qexectab; /* stores qexec packets */ + apr_hash_t* qlogtab; /* stores qlog packets */ + apr_hash_t* segmenttab; /* stores segment packets */ + apr_hash_t* pidtab; /* key=pid, value=pidrec_t */ + apr_hash_t* querysegtab; /* stores gpmon_query_seginfo_t */ +}; + +typedef struct qexec_agg_hash_key_t { + apr_int32_t tmid; /* transaction time */ + apr_int32_t ssid; /* session id */ + apr_int32_t ccnt; /* command count */ + apr_int16_t nid; /* plan node id */ +}qexec_agg_hash_key_t; + + +typedef struct qexec_agg_t{ + qexec_agg_hash_key_t key; + apr_hash_t* qexecaggtab; +}qexec_agg_t; + +static struct gx_t gx = { 0 }; + +/* structs and hash tables for metrics */ +static apr_hash_t* net_devices = NULL; +static apr_hash_t* disk_devices = NULL; +struct timeval g_time_last_reading = { 0 }; + +typedef struct net_device_t net_device_t; +struct net_device_t +{ + char* name; + apr_uint64_t rx_bytes; + apr_uint64_t tx_bytes; + apr_uint64_t rx_packets; + apr_uint64_t tx_packets; +}; + +typedef struct disk_device_t disk_device_t; +struct disk_device_t +{ + char* name; + apr_uint64_t reads; + apr_uint64_t writes; + apr_uint64_t read_bytes; + apr_uint64_t write_bytes; +}; + + +#define LOG_FILENAME_SIZE 64 +char log_filename[LOG_FILENAME_SIZE]; +void update_log_filename() +{ + time_t stamp = time(NULL); + struct tm* tm = gmtime(&stamp); + snprintf(log_filename, LOG_FILENAME_SIZE, "gpsmon.%d.%02d.%02d_%02d%02d%02d.log", + tm->tm_year + 1900, + tm->tm_mon + 1, + tm->tm_mday, + tm->tm_hour, + tm->tm_min, + tm->tm_sec); +} + +static void gx_accept(SOCKET sock, short event, void* arg); +static void gx_recvfrom(SOCKET sock, short event, void* arg); +static apr_uint32_t create_qexec_packet(const 
gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt); + +/** + * helper function to copy the union packet from a gpmon_packet_t to a gp_smon_to_mmon_packet_t + * @note This function should never be called with a qexec packet! + */ +static inline void copy_union_packet_gp_smon_to_mmon(gp_smon_to_mmon_packet_t* pkt, const gpmon_packet_t* pkt_src) +{ + switch (pkt_src->pkttype) { + case GPMON_PKTTYPE_HELLO: + memcpy(&pkt->u.hello, &pkt_src->u.hello, sizeof(gpmon_hello_t)); + break; + case GPMON_PKTTYPE_METRICS: + memcpy(&pkt->u.metrics, &pkt_src->u.metrics, sizeof(gpmon_metrics_t)); + break; + case GPMON_PKTTYPE_QLOG: + memcpy(&pkt->u.qlog, &pkt_src->u.qlog, sizeof(gpmon_qlog_t)); + break; + case GPMON_PKTTYPE_SEGINFO: + memcpy(&pkt->u.seginfo, &pkt_src->u.seginfo, sizeof(gpmon_seginfo_t)); + break; + case GPMON_PKTTYPE_QUERY_HOST_METRICS: + memcpy(&pkt->u.qlog, &pkt_src->u.qlog, sizeof(gpmon_qlog_t)); + break; + case GPMON_PKTTYPE_FSINFO: + memcpy(&pkt->u.fsinfo, &pkt_src->u.fsinfo, sizeof(gpmon_fsinfo_t)); + break; + case GPMON_PKTTYPE_QUERYSEG: + case GPMON_PKTTYPE_QEXEC: + default: + gpmon_fatal(FLINE, "Invalid pkttype %d for copy_union_packet_gp_smon_to_mmon\n", pkt_src->pkttype); + break; + } + return; +} + +/** + * This local helper function allocates a gp_smon_to_mmon_packet_t and copies the gpmon_packet_t to it + * @note This function should never be called with a qexec packet! + */ +static gp_smon_to_mmon_packet_t* gx_pkt_to_smon_to_mmon(apr_pool_t* pool, const gpmon_packet_t* pkt) +{ + gp_smon_to_mmon_packet_t* t = apr_palloc(pool, sizeof(*t)); + CHECKMEM(t); + gp_smon_to_mmon_set_header(t, pkt->pkttype); + copy_union_packet_gp_smon_to_mmon(t, pkt); + return t; +} + +static void gx_exit(const char* reason) +{ + TR0(("exit %s\n", reason ? reason : "1")); + exit(reason ? 
1 : 0); +} + +static void send_fully(SOCKET sock, const void* p_, int len) +{ + const char* p = p_; + const char* q = p + len; + if (len == 0) + return; + while (p < q) + { + int n = send(sock, p, q - p, 0); + if (n == -1) + { + switch (errno) + { + case EINTR: + case EAGAIN: + continue; + } + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "send failed"); + } + p += n; + } +} +/* Helper function to send the header and then send the union packet */ +static void send_smon_to_mon_pkt(SOCKET sock, gp_smon_to_mmon_packet_t* pkt) +{ + send_fully(sock, &pkt->header, sizeof(gp_smon_to_mmon_header_t)); + if (pkt->header.pkttype == GPMON_PKTTYPE_QEXEC) { + send_fully(sock, &pkt->u.qexec_packet.data, sizeof(qexec_packet_data_t) ); + } else { + send_fully(sock, &pkt->u, get_size_by_pkttype_smon_to_mmon(pkt->header.pkttype)); + } + TR2(("Sent packet of type %d to mmon\n", pkt->header.pkttype)); +} + +static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt) +{ + apr_int32_t status; + sigar_proc_cpu_t cpu; + sigar_proc_mem_t mem; + sigar_proc_fd_t fd; + pidrec_t* rec; + apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); + + rec = apr_hash_get(gx.pidtab, &pid, sizeof(pid)); + if (rec && rec->updated_tick == gx.tick) + return; /* updated in current cycle */ + + memset(&cpu, 0, sizeof(cpu)); + memset(&mem, 0, sizeof(mem)); + memset(&fd, 0, sizeof(fd)); + + TR2(("--------------------- starting %d\n", pid)); + + if (!rec) + { + sigar_proc_exe_t exe; + + /* There might be cases where the pid no longer exist, so we'll just + * zero out the memory first before doing anything */ + rec = apr_pcalloc(pool, sizeof(*rec)); + CHECKMEM(rec); + + rec->pid = pid; + rec->query_key.tmid = tmid; + rec->query_key.ssid = ssid; + rec->query_key.ccnt = ccnt; + + rec->pname = rec->cwd = 0; + if (0 == sigar_proc_exe_get(gx.sigar, pid, &exe)) + { + rec->pname = apr_pstrdup(pool, exe.name); + rec->cwd = apr_pstrdup(pool, exe.root); + } + if (!rec->pname) + rec->pname 
= "unknown"; + if (!rec->cwd) + rec->cwd = "unknown"; + + apr_hash_set(gx.pidtab, &rec->pid, sizeof(rec->pid), rec); + } + + status = sigar_proc_mem_get(gx.sigar, pid, &mem); + /* ESRCH is error 3: (No such process) */ + if (status != SIGAR_OK) + { + if (status != ESRCH) { + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + } + return; + } + + status = sigar_proc_cpu_get(gx.sigar, pid, &cpu); + if (status != SIGAR_OK) + { + if (status != ESRCH) { + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + } + return; + } + + status = sigar_proc_fd_get(gx.sigar, pid, &fd); + if (status != SIGAR_OK) + { + if (status != ESRCH) { + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + } + return; + } + + rec->updated_tick = gx.tick; + rec->p_metrics.fd_cnt = (apr_uint32_t) fd.total; + rec->p_metrics.cpu_pct = (float) (cpu.percent * cpu_cores_utilization_multiplier); + rec->p_metrics.mem.size = mem.size; + rec->p_metrics.mem.resident = mem.resident; + +#ifdef __linux__ + rec->p_metrics.mem.share = mem.share; +#else + rec->p_metrics.mem.share = 0; +#endif + + rec->cpu_elapsed = cpu.total; +} + + + +#define FSUSAGE_TOBYTES(X) (X * 1024) + +static void send_fsinfo(SOCKET sock) +{ + sigar_file_system_usage_t fsusage; + gp_smon_to_mmon_packet_t pkt; + const char** fsdir; + int status = 0; + + memset(&fsusage, 0, sizeof(sigar_file_system_usage_t)); + + for (fsdir = gx.fslist; *fsdir; fsdir++) + { + status = sigar_file_system_usage_get(gx.sigar, *fsdir, &fsusage); + if (status == SIGAR_OK) + { + TR2(("sigar_file_system_usage_get() succeeded. 
fsdir: %s total: %lu free: %lu used: %lu \n", *fsdir, fsusage.total, fsusage.free, fsusage.used)); + memset(&pkt, 0, sizeof(gp_smon_to_mmon_packet_t)); + + gp_smon_to_mmon_set_header(&pkt,GPMON_PKTTYPE_FSINFO); + + strncpy(pkt.u.fsinfo.key.fsname, *fsdir, sizeof(pkt.u.fsinfo.key.fsname) - 1); + + pkt.u.fsinfo.bytes_used = FSUSAGE_TOBYTES(fsusage.used); + pkt.u.fsinfo.bytes_available = FSUSAGE_TOBYTES(fsusage.free); + pkt.u.fsinfo.bytes_total = FSUSAGE_TOBYTES(fsusage.total); + strncpy(pkt.u.fsinfo.key.hostname, gx.hostname, sizeof(pkt.u.fsinfo.key.hostname) - 1); + + send_smon_to_mon_pkt(sock, &pkt); + } + else + { + TR2(("sigar_file_system_usage_get() failed. fsdir: %s status: %i \n", *fsdir, status)); + } + } +} + +// Helper function to calculate the metric differences +static apr_uint64_t metric_diff_calc( sigar_uint64_t newval, apr_uint64_t oldval, const char *name_for_log, const char* value_name_for_log ){ + apr_uint64_t diff; + + if (newval < oldval) // assume that the value was reset and we are starting over + { + TR0(("metric_diff_calc: new value %" APR_UINT64_T_FMT " is less than old value %" APR_UINT64_T_FMT " for %s metric %s; assume the value was reset and set diff to new value.\n", + newval, oldval, name_for_log, value_name_for_log)); + diff = newval; + } + else + { + diff = newval - oldval; + } +#if defined(rhel7_x86_64) || defined(rhel6_x86_64) || defined(suse10_x86_64) + // Add this debug on 64 bit machines to try and debug strange values we are seeing + if(diff > 1000000000000000000 ) { + TR0(("Crazy high value for diff! 
new value=%" APR_UINT64_T_FMT ", old value=%" APR_UINT64_T_FMT ", diff=%" APR_UINT64_T_FMT " for %s metric %s; assume the value was reset and set diff to new value.\n", + newval, oldval, name_for_log, value_name_for_log)); + } +#endif + return diff; +} + + +// Helper function to calculate cpu percentage during a period +static float calc_diff_percentage(sigar_uint64_t newvalue, sigar_uint64_t oldvalue, int total_diff, const char *itemname) +{ + float result = ((float) (newvalue - oldvalue) * 100 / total_diff); + if (newvalue < oldvalue) + { + TR0(("calc_diff_percentage: new value %" APR_UINT64_T_FMT " is less than old value %" APR_UINT64_T_FMT " for metric %s; set to 0.\n", + newvalue, oldvalue, itemname)); + result = 0.0; + } + else if (result > 100) + { + TR0(("calc_diff_percentage: new value %" APR_UINT64_T_FMT " old value %" APR_UINT64_T_FMT " total diff %d for metric %s; set to 100.\n", + newvalue, oldvalue, total_diff, itemname)); + result = 100; + } + return result; +} + +static void send_machine_metrics(SOCKET sock) +{ + sigar_mem_t mem; + sigar_swap_t swap; + sigar_cpu_t cpu; + sigar_loadavg_t loadavg; + sigar_disk_usage_t tdisk; + sigar_net_interface_stat_t tnet; + static int first = 1; + static sigar_cpu_t pcpu = { 0 }; + static sigar_swap_t pswap = { 0 }; + gp_smon_to_mmon_packet_t pkt; + struct timeval currenttime = { 0 }; + double seconds_duration = 0.0; + sigar_file_system_usage_t fsusage; + const char** fsdir; + const char** netname; + sigar_net_interface_stat_t netstat; + int cpu_total_diff; + + /* NIC metrics */ + apr_uint64_t rx_packets = 0; + apr_uint64_t tx_packets = 0; + apr_uint64_t rx_bytes = 0; + apr_uint64_t tx_bytes = 0; + + /* Disk metrics */ + apr_uint64_t reads = 0; + apr_uint64_t writes = 0; + apr_uint64_t read_bytes = 0; + apr_uint64_t write_bytes = 0; + + memset(&mem, 0, sizeof(mem)); + sigar_mem_get(gx.sigar, &mem); + TR2(("mem ram: %" FMT64 " total: %" FMT64 " used: %" FMT64 " free: %" FMT64 "\n", + mem.ram, mem.total, mem.used, 
mem.free)); + + memset(&swap, 0, sizeof(swap)); + sigar_swap_get(gx.sigar, &swap); + TR2(("swap total: %" FMT64 " used: %" FMT64 "page_in: %" FMT64 " page_out: %" FMT64 "\n", + swap.total, swap.used, swap.page_in, swap.page_out)); + + memset(&cpu, 0, sizeof(cpu)); + sigar_cpu_get(gx.sigar, &cpu); + TR2(("cpu user: %" FMT64 " sys: %" FMT64 " idle: %" FMT64 " wait: %" FMT64 " nice: %" FMT64 " total: %" FMT64 "\n", + cpu.user, cpu.sys, cpu.idle, cpu.wait, cpu.nice, cpu.total)); + + memset(&loadavg, 0, sizeof(loadavg)); + sigar_loadavg_get(gx.sigar, &loadavg); + TR2(("load_avg: %e %e %e\n", loadavg.loadavg[0], loadavg.loadavg[1], loadavg.loadavg[2])); + memset(&tdisk, 0, sizeof(tdisk)); + memset(&tnet, 0, sizeof(tnet)); + + for (fsdir = gx.fslist; *fsdir; fsdir++) + { + int e = sigar_file_system_usage_get(gx.sigar, *fsdir, &fsusage); + + if (0 == e) + { + disk_device_t* disk = (disk_device_t*)apr_hash_get(disk_devices, *fsdir, APR_HASH_KEY_STRING); + /* Check if this is a new device */ + if (!disk) + { + disk = (disk_device_t*)apr_palloc(gx.pool, sizeof(disk_device_t)); + disk->name = apr_pstrdup(gx.pool, *fsdir); + disk->read_bytes = disk->write_bytes = disk->reads = disk->writes = 0; + apr_hash_set(disk_devices, disk->name, APR_HASH_KEY_STRING, disk); + } + reads = disk->reads; + writes = disk->writes; + read_bytes = disk->read_bytes; + write_bytes = disk->write_bytes; + + // DISK READS + reads = metric_diff_calc(fsusage.disk.reads, disk->reads, disk->name, "disk reads"); + disk->reads = fsusage.disk.reads; // old = new + + // DISK WRITES + writes = metric_diff_calc(fsusage.disk.writes, disk->writes, disk->name, "disk writes"); + disk->writes = fsusage.disk.writes; // old = new + + // WRITE BYTES + write_bytes = metric_diff_calc(fsusage.disk.write_bytes, disk->write_bytes, disk->name, "disk write bytes"); + disk->write_bytes = fsusage.disk.write_bytes; // old = new + + // READ BYTES + read_bytes = metric_diff_calc(fsusage.disk.read_bytes, disk->read_bytes, 
disk->name, "disk read bytes"); + disk->read_bytes = fsusage.disk.read_bytes; // old = new + + tdisk.reads += reads; + tdisk.writes += writes; + tdisk.write_bytes += write_bytes; + tdisk.read_bytes += read_bytes; + } + } + TR2(("disk reads: %" APR_UINT64_T_FMT " writes: %" APR_UINT64_T_FMT + " rbytes: %" APR_UINT64_T_FMT " wbytes: %" APR_UINT64_T_FMT "\n", + tdisk.reads, tdisk.writes, tdisk.read_bytes, tdisk.write_bytes)); + + for (netname = gx.netlist; *netname; netname++) + { + int e = sigar_net_interface_stat_get(gx.sigar, *netname, &netstat); + + if (0 == e) + { + net_device_t* nic = (net_device_t*)apr_hash_get(net_devices, *netname, APR_HASH_KEY_STRING); + + /* Check if this is a new device */ + if (!nic) + { + nic = (net_device_t*)apr_palloc(gx.pool, sizeof(net_device_t)); + nic->name = apr_pstrdup(gx.pool, *netname); + nic->tx_bytes = nic->rx_bytes = nic->tx_packets = nic->rx_packets = 0; + apr_hash_set(net_devices, nic->name, APR_HASH_KEY_STRING, nic); + } + + //////// RECEIVE PACKEtS + rx_packets = metric_diff_calc(netstat.rx_packets, nic->rx_packets, nic->name, "rx packets"); + nic->rx_packets = netstat.rx_packets; // old = new + + //////// RECEIVE BYTES + rx_bytes = metric_diff_calc(netstat.rx_bytes, nic->rx_bytes, nic->name, "rx bytes"); + nic->rx_bytes = netstat.rx_bytes; // old = new + + //////// SEND PACKETS + tx_packets = metric_diff_calc(netstat.tx_packets, nic->tx_packets, nic->name, "tx packets"); + nic->tx_packets = netstat.tx_packets; // old = new + + //////// SEND BYTES + tx_bytes = metric_diff_calc(netstat.tx_bytes, nic->tx_bytes, nic->name, "tx bytes"); + nic->tx_bytes = netstat.tx_bytes; // old = new + + tnet.rx_packets += rx_packets; + tnet.rx_bytes += rx_bytes; + tnet.tx_packets += tx_packets; + tnet.tx_bytes += tx_bytes; + } + } + + TR2(("rx: %" APR_UINT64_T_FMT " rx_bytes: %" APR_UINT64_T_FMT "\n", + tnet.rx_packets, tnet.rx_bytes)); + TR2(("tx: %" APR_UINT64_T_FMT " tx_bytes: %" APR_UINT64_T_FMT "\n", + tnet.tx_packets, 
tnet.tx_bytes)); + + if (first) + { + pswap = swap, pcpu = cpu; + + /* We want 0s for these metrics on first pass rather + * than some possibly huge number that will throw off + * the UI graphs. + */ + memset(&tdisk, 0, sizeof(tdisk)); + memset(&tnet, 0, sizeof(tnet)); + } + first = 0; + + gp_smon_to_mmon_set_header(&pkt,GPMON_PKTTYPE_METRICS); + + pkt.u.metrics.mem.total = mem.total; + pkt.u.metrics.mem.used = mem.used; + pkt.u.metrics.mem.actual_used = mem.actual_used; + pkt.u.metrics.mem.actual_free = mem.actual_free; + pkt.u.metrics.swap.total = swap.total; + pkt.u.metrics.swap.used = swap.used; + pkt.u.metrics.swap.page_in = swap.page_in - pswap.page_in; + pkt.u.metrics.swap.page_out = swap.page_out - pswap.page_out; + cpu_total_diff = cpu.total - pcpu.total; + if (cpu_total_diff) + { + float cpu_user = calc_diff_percentage(cpu.user, pcpu.user, cpu_total_diff, "cpu.user") + calc_diff_percentage(cpu.nice, pcpu.nice, cpu_total_diff, "cpu.nice"); + float cpu_sys = calc_diff_percentage(cpu.sys, pcpu.sys, cpu_total_diff, "cpu.sys") + calc_diff_percentage(cpu.wait, pcpu.wait, cpu_total_diff, "cpu.wait"); + float cpu_idle = calc_diff_percentage(cpu.idle, pcpu.idle, cpu_total_diff, "cpu.idle"); + + + pkt.u.metrics.cpu.user_pct = cpu_user; + pkt.u.metrics.cpu.sys_pct = cpu_sys; + pkt.u.metrics.cpu.idle_pct = cpu_idle; + } + else + { + pkt.u.metrics.cpu.user_pct = 0; + pkt.u.metrics.cpu.sys_pct = 0; + pkt.u.metrics.cpu.idle_pct = 0; + } + pkt.u.metrics.load_avg.value[0] = (float) loadavg.loadavg[0]; + pkt.u.metrics.load_avg.value[1] = (float) loadavg.loadavg[1]; + pkt.u.metrics.load_avg.value[2] = (float) loadavg.loadavg[2]; + + gettimeofday(¤ttime, NULL); + seconds_duration = subtractTimeOfDay(&g_time_last_reading, ¤ttime); + + pkt.u.metrics.disk.ro_rate = (apr_uint64_t)ceil(tdisk.reads/seconds_duration); + pkt.u.metrics.disk.wo_rate = (apr_uint64_t)ceil(tdisk.writes/seconds_duration); + pkt.u.metrics.disk.rb_rate = 
(apr_uint64_t)ceil(tdisk.read_bytes/seconds_duration); + pkt.u.metrics.disk.wb_rate = (apr_uint64_t)ceil(tdisk.write_bytes/seconds_duration); + pkt.u.metrics.net.rp_rate = (apr_uint64_t)ceil(tnet.rx_packets/seconds_duration); + pkt.u.metrics.net.wp_rate = (apr_uint64_t)ceil(tnet.tx_packets/seconds_duration); + pkt.u.metrics.net.rb_rate = (apr_uint64_t)ceil(tnet.rx_bytes/seconds_duration); + pkt.u.metrics.net.wb_rate = (apr_uint64_t)ceil(tnet.tx_bytes/seconds_duration); + + g_time_last_reading = currenttime; + + strncpy(pkt.u.metrics.hname, gx.hostname, sizeof(pkt.u.metrics.hname) - 1); + pkt.u.metrics.hname[sizeof(pkt.u.metrics.hname) - 1] = 0; + send_smon_to_mon_pkt(sock, &pkt); + + /* save for next time around */ + pswap = swap, pcpu = cpu; +} + +static void gx_gettcpcmd(SOCKET sock, short event, void* arg) +{ + char dump; + int n, e; + apr_pool_t* oldpool; + apr_hash_t* qetab; + apr_hash_t* qdtab; + apr_hash_t* pidtab; + apr_hash_t* segtab; + if (event & EV_TIMEOUT) // didn't get command from gpmmon, quit + { + if(gx.tcp_sock) + { + close(gx.tcp_sock); + gx.tcp_sock=0; + if (event_del(&gx.tcp_event)) + gpsmon_fatal(FLINE, "event_del failed"); + } + return; + } + apr_hash_t* querysegtab; + n = recv(sock, &dump, 1, 0); + if (n == 0) + gx_exit("peer closed"); + + if (n == -1) + gx_exit("socket error"); + + if (dump != 'D') + gx_exit("bad data"); + + TR1(("start dump %c\n", dump)); + + qetab = gx.qexectab; + qdtab = gx.qlogtab; + pidtab = gx.pidtab; + segtab = gx.segmenttab; + querysegtab = gx.querysegtab; + + oldpool = apr_hash_pool_get(qetab); + + /* make new hashtabs for next cycle */ + { + apr_pool_t* newpool; + if (0 != (e = apr_pool_create_alloc(&newpool, gx.pool))) + { + gpsmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + } + /* qexec hash table */ + gx.qexectab = apr_hash_make(newpool); + CHECKMEM(gx.qexectab); + + /* qlog hash table */ + gx.qlogtab = apr_hash_make(newpool); + CHECKMEM(gx.qlogtab); + + /* segment hash table */ + gx.segmenttab = 
apr_hash_make(newpool); + CHECKMEM(gx.segmenttab); + + /* queryseg hash table */ + gx.querysegtab = apr_hash_make(newpool); + CHECKMEM(gx.querysegtab); + + /* pidtab hash table */ + gx.pidtab = apr_hash_make(newpool); + CHECKMEM(gx.pidtab); + } + + /* push out a metric of the machine */ + send_machine_metrics(sock); + send_fsinfo(sock); + + /* push out records */ + { + apr_hash_index_t* hi; + gp_smon_to_mmon_packet_t* ppkt = 0; + gp_smon_to_mmon_packet_t localPacketObject; + pidrec_t* pidrec; + int count = 0; + apr_hash_t* query_cpu_table = NULL; + + for (hi = apr_hash_first(0, segtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + apr_hash_this(hi, 0, 0, &vptr); + ppkt = vptr; + if (ppkt->header.pkttype != GPMON_PKTTYPE_SEGINFO) + continue; + + /* fill in hostname */ + strncpy(ppkt->u.seginfo.hostname, gx.hostname, sizeof(ppkt->u.seginfo.hostname) - 1); + ppkt->u.seginfo.hostname[sizeof(ppkt->u.seginfo.hostname) - 1] = 0; + + TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); + send_smon_to_mon_pkt(sock, ppkt); + count++; + } + + for (hi = apr_hash_first(0, qdtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + apr_hash_this(hi, 0, 0, &vptr); + ppkt = vptr; + if (ppkt->header.pkttype != GPMON_PKTTYPE_QLOG) + continue; + TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); + send_smon_to_mon_pkt(sock, ppkt); + count++; + } + + /* + * QUERYSEG packets must be sent after QLOG packets so that gpmmon can + * correctly populate its query_seginfo_hash. 
+ */ + for (hi = apr_hash_first(0, querysegtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + apr_hash_this(hi, 0, 0, &vptr); + ppkt = vptr; + if (ppkt->header.pkttype != GPMON_PKTTYPE_QUERYSEG) + continue; + + TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); + send_smon_to_mon_pkt(sock, ppkt); + count++; + } + + for (hi = apr_hash_first(0, qetab); hi; hi = apr_hash_next(hi)) + { + gpmon_qexec_t* qexec; + void *vptr; + + apr_hash_this(hi, 0, 0, &vptr); + qexec = vptr; + /* fill in _p_metrics */ + pidrec = apr_hash_get(pidtab, &qexec->key.hash_key.pid, sizeof(qexec->key.hash_key.pid)); + if (pidrec) { + qexec->_p_metrics = pidrec->p_metrics; + qexec->_cpu_elapsed = pidrec->cpu_elapsed; + } else { + memset(&qexec->_p_metrics, 0, sizeof(qexec->_p_metrics)); + } + + /* fill in _hname */ + strncpy(qexec->_hname, gx.hostname, sizeof(qexec->_hname) - 1); + qexec->_hname[sizeof(qexec->_hname) - 1] = 0; + + if (0 == create_qexec_packet(qexec, &localPacketObject)) { + break; + } + + TR2(("sending qexec, pkttype %d\n", localPacketObject.header.pkttype)); + send_smon_to_mon_pkt(sock, &localPacketObject); + count++; + } + + // calculate CPU utilization per query for this machine + query_cpu_table = apr_hash_make(oldpool); + CHECKMEM(query_cpu_table); + + // loop through PID's and add to Query CPU Hash Table + for (hi = apr_hash_first(0, pidtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + pidrec_t* lookup; + + apr_hash_this(hi, 0, 0, &vptr); + pidrec = vptr; + + TR2(("tmid %d ssid %d ccnt %d pid %d (CPU elapsed %ld CPU Percent %.2f)\n", + pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, + pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct)); + + // table is keyed on query key + lookup = apr_hash_get(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key)); + + if (lookup) + { + // found other pids with same query key so add the metrics to that + + lookup->cpu_elapsed += pidrec->cpu_elapsed; + 
lookup->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; + } + else + { + // insert existing pid record into table keyed by query key + apr_hash_set(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key), pidrec); + } + + } + + // reset packet to 0 + ppkt = &localPacketObject; + memset(ppkt, 0, sizeof(gp_smon_to_mmon_packet_t)); + gp_smon_to_mmon_set_header(ppkt,GPMON_PKTTYPE_QUERY_HOST_METRICS); + + // add the hostname into the packet for DEBUGGING purposes only. This is not used + strncpy(ppkt->u.qlog.user, gx.hostname, sizeof(ppkt->u.qlog.user) - 1); + ppkt->u.qlog.user[sizeof(ppkt->u.qlog.user) - 1] = 0; + + // loop through the query per cpu table and send the metrics + for (hi = apr_hash_first(0, query_cpu_table); hi; hi = apr_hash_next(hi)) + { + void* vptr; + apr_hash_this(hi, 0, 0, &vptr); + pidrec = vptr; + + ppkt->u.qlog.key.tmid = pidrec->query_key.tmid; + ppkt->u.qlog.key.ssid = pidrec->query_key.ssid; + ppkt->u.qlog.key.ccnt = pidrec->query_key.ccnt; + ppkt->u.qlog.cpu_elapsed = pidrec->cpu_elapsed; + ppkt->u.qlog.p_metrics.cpu_pct = pidrec->p_metrics.cpu_pct; + + TR2(("SEND tmid %d ssid %d ccnt %d (CPU elapsed %ld CPU Percent %.2f)\n", + ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt, + ppkt->u.qlog.cpu_elapsed, ppkt->u.qlog.p_metrics.cpu_pct)); + + send_smon_to_mon_pkt(sock, ppkt); + count++; + } + + TR1(("end dump ... sent %d entries\n", count)); + } + + /* get rid of the old pool */ + { + apr_pool_destroy(oldpool); + } + struct timeval tv; + tv.tv_sec = opt.terminate_timeout; + tv.tv_usec = 0; + if (event_add(&gx.tcp_event, opt.terminate_timeout ? 
&tv : NULL)) //reset timeout + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); + } + return; +} + +static void gx_accept(SOCKET sock, short event, void* arg) +{ + SOCKET nsock; + gp_smon_to_mmon_packet_t pkt; + struct sockaddr_in a; + socklen_t alen = sizeof(a); + char* p; + char* q; + + if (event & EV_TIMEOUT) + { + if (gx.tcp_sock) + { + /* start watching connect request again */ + if (event_add(&gx.listen_event, 0)) + { + gpsmon_fatal(FLINE, "event_add failed"); + } + return; + } + gpmon_fatal(FLINE, "smon terminates due to no requests come after %" FMT64 " seconds\n", opt.terminate_timeout); + } + + if (0 == (event & EV_READ)) + return; + + if (-1 == (nsock = accept(sock, (void*) &a, &alen))) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "accept failed"); + return; + } + + TR1(("accepted\n")); + + /* we do this one at a time */ + if (gx.tcp_sock) + { + gpmon_warning(FLINE, "cannot accept new connection before old one dies"); + close(nsock); + return; + } + + p = (char*) &pkt; + q = p + sizeof(pkt); + while (p < q) + { + int n = recv(nsock, p, q - p, 0); + if (n == -1) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "recv failed"); + close(nsock); + return; + } + p += n; + } + + if (0 != gpmon_ntohpkt(pkt.header.magic, pkt.header.version, pkt.header.pkttype)) + { + close(nsock); + return; + } + + if (pkt.header.pkttype != GPMON_PKTTYPE_HELLO) + { + close(nsock); + return; + } + + if (pkt.u.hello.signature != gx.signature) + { + gx_exit("bad signature... maybe a new gpmmon has started"); + } + + /* echo the hello */ + pkt.u.hello.pid = getpid(); + TR2(("accepted pkt.magic = %x\n", (int) pkt.header.magic)); + send_smon_to_mon_pkt(nsock, &pkt); + + struct timeval tv; + tv.tv_sec = opt.terminate_timeout; + tv.tv_usec = 0; + event_set(&gx.tcp_event, nsock, EV_READ | EV_PERSIST | EV_TIMEOUT, gx_gettcpcmd, 0); + if (event_add(&gx.tcp_event, opt.terminate_timeout ? 
&tv : NULL)) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); + close(nsock); + return; + } + gx.tcp_sock = nsock; + TR1(("connection established --------------------- \n")); +} + +/* got a packet from peer. put it in the queue */ +static void gx_recvqlog(gpmon_packet_t* pkt) +{ + gpmon_qlog_t* p; + gp_smon_to_mmon_packet_t* rec; + + if (pkt->pkttype != GPMON_PKTTYPE_QLOG) + gpsmon_fatal(FLINE, "assert failed; expected pkttype qlog"); + + p = &pkt->u.qlog; + TR2(("Received qlog packet for query %d-%d-%d. Status now %d\n", p->key.tmid, p->key.ssid, p->key.ccnt, p->status)); + rec = apr_hash_get(gx.qlogtab, &p->key, sizeof(p->key)); + if (rec) + { + //memcpy(&rec->u.qlog, p, sizeof(*p)); + merge_qlog(&rec->u.qlog, p); + } + else + { + rec = gx_pkt_to_smon_to_mmon(apr_hash_pool_get(gx.qlogtab), pkt); + apr_hash_set(gx.qlogtab, &rec->u.qlog.key, sizeof(rec->u.qlog.key), rec); + } +} + +static void gx_recvsegment(gpmon_packet_t* pkt) +{ + gpmon_seginfo_t* p; + gp_smon_to_mmon_packet_t* rec; + + if (pkt->pkttype != GPMON_PKTTYPE_SEGINFO) + gpsmon_fatal(FLINE, "assert failed; expected pkttype segment"); + + p = &pkt->u.seginfo; + + TR2(("Received segment packet for dbid %d (dynamic_memory_used, dynamic_memory_available) (%lu %lu)\n", p->dbid, p->dynamic_memory_used, p->dynamic_memory_available)); + + rec = apr_hash_get(gx.segmenttab, &p->dbid, sizeof(p->dbid)); + if (rec) + { + memcpy(&rec->u.seginfo, p, sizeof(*p)); + } + else + { + rec = gx_pkt_to_smon_to_mmon(apr_hash_pool_get(gx.segmenttab), pkt); + apr_hash_set(gx.segmenttab, &rec->u.seginfo.dbid, sizeof(rec->u.seginfo.dbid), rec); + } +} + +/** +* write the qexec packet. 
+* @return 1 if success, 0 if failure +*/ +static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt) +{ + // Copy over needed values + memcpy(&pkt->u.qexec_packet.data.key, &qexec->key, sizeof(gpmon_qexeckey_t)); + pkt->u.qexec_packet.data.measures_rows_in = qexec->rowsout; + pkt->u.qexec_packet.data._cpu_elapsed = qexec->_cpu_elapsed; + pkt->u.qexec_packet.data.rowsout = qexec->rowsout; + + gp_smon_to_mmon_set_header(pkt,GPMON_PKTTYPE_QEXEC); + return 1; +} + +static void extract_segments_exec(gpmon_packet_t* pkt) +{ + gpmon_qexec_t *p; + gp_smon_to_mmon_packet_t *rec; + gpmon_query_seginfo_key_t qseg_key; + pidrec_t *pidrec; + + if (pkt->pkttype != GPMON_PKTTYPE_QEXEC) + gpsmon_fatal(FLINE, "assert failed; expected pkttype qexec"); + + p = &pkt->u.qexec; + qseg_key.qkey.tmid = p->key.tmid; + qseg_key.qkey.ssid = p->key.ssid; + qseg_key.qkey.ccnt = p->key.ccnt; + qseg_key.segid = p->key.hash_key.segid; + + rec = apr_hash_get(gx.querysegtab, &qseg_key, sizeof(qseg_key)); + pidrec = apr_hash_get(gx.pidtab, &p->key.hash_key.pid, sizeof(p->key.hash_key.pid)); + ASSERT(pidrec); + + if (rec) + { + rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; + rec->u.queryseg.sum_measures_rows_out += p->rowsout; + if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1 && (int64)(p->rowsout) > rec->u.queryseg.final_rowsout) + { + rec->u.queryseg.final_rowsout = p->rowsout; + } + } + else + { + rec = apr_palloc(apr_hash_pool_get(gx.querysegtab), + sizeof(gp_smon_to_mmon_packet_t)); + CHECKMEM(rec); + gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); + rec->u.queryseg.key = qseg_key; + if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1) + { + rec->u.queryseg.final_rowsout = p->rowsout; + } + else + { + rec->u.queryseg.final_rowsout = -1; + } + rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; + rec->u.queryseg.sum_measures_rows_out = p->rowsout; + apr_hash_set(gx.querysegtab, &rec->u.queryseg.key, 
sizeof(rec->u.queryseg.key), rec); + } +} + +static void gx_recvqexec(gpmon_packet_t* pkt) +{ + gpmon_qexec_t* p; + + if (pkt->pkttype != GPMON_PKTTYPE_QEXEC) + gpsmon_fatal(FLINE, "assert failed; expected pkttype qexec"); + TR2(("received qexec packet\n")); + + p = &pkt->u.qexec; + get_pid_metrics(p->key.hash_key.pid, + p->key.tmid, + p->key.ssid, + p->key.ccnt); + // Store some aggregated information somewhere for metrics in + // queries_* tables, like cpu_elapsed, rows_out, and etc. + extract_segments_exec(pkt); + // We don't call gpmon_warning here because the number of + // packet is big, and we would make log boating. + return; +} + +/* callback from libevent when a udp socket is ready to be read. + This function determines the packet type, then calls + gx_recvqlog() or gx_recvqexec(). + */ +static void gx_recvfrom(SOCKET sock, short event, void* arg) +{ + gpmon_packet_t pkt; + struct sockaddr_in addr; + socklen_t addrlen = sizeof(addr); + int n; + + if (!(event & EV_READ)) + return; + + n = recvfrom(sock, &pkt, sizeof(pkt), 0, (void*) &addr, &addrlen); + if (n == -1) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "recvfrom failed"); + return; + } + + if (n != sizeof(pkt)) + { + gpmon_warning(FLINE, "bad packet (length %d). 
Expected packet size %d", n, (int) sizeof(pkt)); + return; + } + + /* do some packet marshaling */ + if (0 != gpmon_ntohpkt(pkt.magic, pkt.version, pkt.pkttype)) + { + gpmon_warning(FLINE, "error with packet marshaling"); + return; + } + + /* process the packet */ + switch (pkt.pkttype) + { + case GPMON_PKTTYPE_QLOG: + gx_recvqlog(&pkt); + break; + case GPMON_PKTTYPE_SEGINFO: + gx_recvsegment(&pkt); + break; + case GPMON_PKTTYPE_QEXEC: + gx_recvqexec(&pkt); + break; + default: + gpmon_warning(FLINE, "unexpected packet type %d", pkt.pkttype); + return; + } +} + +static void setup_tcp(void) +{ + SOCKET sock = 0; + + struct addrinfo hints; + struct addrinfo *addrs, *rp; + int s; + char service[32]; + + /* + * we let the system pick the TCP port here so we don't have to + * manage port resources ourselves. + */ + snprintf(service,32,"%d",gx.port); + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; /* Allow IPv4 or IPv6 */ + hints.ai_socktype = SOCK_STREAM; /* TCP socket */ + hints.ai_flags = AI_PASSIVE; /* For wildcard IP address */ + hints.ai_protocol = 0; /* Any protocol */ + + s = getaddrinfo(NULL, service, &hints, &addrs); + if (s != 0) + gpsmon_fatalx(FLINE, 0, "getaddrinfo says %s",gai_strerror(s)); + + /* + * getaddrinfo() returns a list of address structures, + * one for each valid address and family we can use. + * + * Try each address until we successfully bind. + * If socket (or bind) fails, we (close the socket + * and) try the next address. This can happen if + * the system supports IPv6, but IPv6 is disabled from + * working, or if it supports IPv6 and IPv4 is disabled. + */ + + /* + * If there is both an AF_INET6 and an AF_INET choice, + * we prefer the AF_INET6, because on UNIX it can receive either + * protocol, whereas AF_INET can only get IPv4. Otherwise we'd need + * to bind two sockets, one for each protocol. + * + * Why not just use AF_INET6 in the hints? 
That works perfect + * if we know this machine supports IPv6 and IPv6 is enabled, + * but we don't know that. + */ + +#ifdef HAVE_IPV6 + if (addrs->ai_family == AF_INET && addrs->ai_next != NULL && addrs->ai_next->ai_family == AF_INET6) + { + /* + * We got both an INET and INET6 possibility, but we want to prefer the INET6 one if it works. + * Reverse the order we got from getaddrinfo so that we try things in our preferred order. + * If we got more possibilities (other AFs??), I don't think we care about them, so don't + * worry if the list is more that two, we just rearrange the first two. + */ + struct addrinfo *temp = addrs->ai_next; /* second node */ + addrs->ai_next = addrs->ai_next->ai_next; /* point old first node to third node if any */ + temp->ai_next = addrs; /* point second node to first */ + addrs = temp; /* start the list with the old second node */ + } +#endif + + for (rp = addrs; rp != NULL; rp = rp->ai_next) + { + int on = 1; + struct linger linger; + /* + * getaddrinfo gives us all the parameters for the socket() call + * as well as the parameters for the bind() call. 
+ */ + + sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sock == -1) + continue; + + setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (void*) &on, sizeof(on)); + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void*) &on, sizeof(on)); + linger.l_onoff = 1; + linger.l_linger = 5; + setsockopt(sock, SOL_SOCKET, SO_LINGER, (void*) &linger, sizeof(linger)); + + + if (bind(sock, rp->ai_addr, rp->ai_addrlen) == 0) + break; /* Success */ + + close(sock); + } + + if (rp == NULL) + { /* No address succeeded */ + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), + "unable to bind tcp socket"); + } + + freeaddrinfo(addrs); + + if (-1 == listen(sock, 5)) + { + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "listen failed"); + } + + /* save it */ + gx.listen_sock = sock; + TR1(("TCP port %d opened\n", gx.port)); + + /* set up listen event, and associate with our event_base */ + event_set(&gx.listen_event, sock, EV_READ | EV_PERSIST | EV_TIMEOUT, gx_accept, 0); + + struct timeval accept_timeout; + accept_timeout.tv_sec = opt.terminate_timeout; + accept_timeout.tv_usec = 0; + + /* start watching this event */ + if (event_add(&gx.listen_event, opt.terminate_timeout ? &accept_timeout : 0)) + { + gpsmon_fatal(FLINE, "event_add failed"); + } + +} + +static void setup_udp() +{ + SOCKET sock = 0; + + struct addrinfo hints; + struct addrinfo *addrs, *rp; + int s; + char service[32]; + + /* + * we let the system pick the TCP port here so we don't have to + * manage port resources ourselves. 
+ */ + snprintf(service,32,"%d",gx.port); + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; /* Allow IPv4 or IPv6 */ + hints.ai_socktype = SOCK_DGRAM; /* UDP socket */ + hints.ai_flags = AI_PASSIVE; /* For wildcard IP address */ + hints.ai_protocol = 0; /* Any protocol */ + + s = getaddrinfo(NULL, service, &hints, &addrs); + if (s != 0) + gpsmon_fatalx(FLINE, 0, "getaddrinfo says %s",gai_strerror(s)); + + /* + * getaddrinfo() returns a list of address structures, + * one for each valid address and family we can use. + * + * Try each address until we successfully bind. + * If socket (or bind) fails, we (close the socket + * and) try the next address. This can happen if + * the system supports IPv6, but IPv6 is disabled from + * working, or if it supports IPv6 and IPv4 is disabled. + */ + + /* + * If there is both an AF_INET6 and an AF_INET choice, + * we prefer the AF_INET6, because on UNIX it can receive either + * protocol, whereas AF_INET can only get IPv4. Otherwise we'd need + * to bind two sockets, one for each protocol. + * + * Why not just use AF_INET6 in the hints? That works perfect + * if we know this machine supports IPv6 and IPv6 is enabled, + * but we don't know that. + */ + +#ifdef HAVE_IPV6 + if (addrs->ai_family == AF_INET && addrs->ai_next != NULL && addrs->ai_next->ai_family == AF_INET6) + { + /* + * We got both an INET and INET6 possibility, but we want to prefer the INET6 one if it works. + * Reverse the order we got from getaddrinfo so that we try things in our preferred order. + * If we got more possibilities (other AFs??), I don't think we care about them, so don't + * worry if the list is more that two, we just rearrange the first two. 
+ */ + struct addrinfo *temp = addrs->ai_next; /* second node */ + addrs->ai_next = addrs->ai_next->ai_next; /* point old first node to third node if any */ + temp->ai_next = addrs; /* point second node to first */ + addrs = temp; /* start the list with the old second node */ + } +#endif + + for (rp = addrs; rp != NULL; rp = rp->ai_next) + { + /* + * getaddrinfo gives us all the parameters for the socket() call + * as well as the parameters for the bind() call. + */ + + sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sock == -1) + continue; + + if (bind(sock, rp->ai_addr, rp->ai_addrlen) == 0) + break; /* Success */ + + close(sock); + } + + if (rp == NULL) + { /* No address succeeded */ + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), + "unable to bind udp socket"); + } + + /* save it */ + gx.udp_sock = sock; + + freeaddrinfo(addrs); + + /* set up udp event */ + event_set(&gx.udp_event, gx.udp_sock, EV_READ | EV_PERSIST, gx_recvfrom, 0); + + /* start watching this event */ + if (event_add(&gx.udp_event, 0)) + { + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); + } +} + +static const char* get_and_allocate_hostname() +{ + char hname[256] = { 0 }; + + if (gethostname(hname, sizeof(hname) - 1)) + { + gx.hostname = strdup("unknown"); + gpmon_warningx(FLINE, 0, "gethostname failed"); + } + else + { + hname[sizeof(hname) - 1] = 0; + gx.hostname = strdup(hname); + } + + return gx.hostname; +} + +static void setup_gx(int port, apr_int64_t signature) +{ + int e; + apr_pool_t* subpool; + + /* set up pool */ + if (0 != (e = apr_pool_create_alloc(&gx.pool, 0))) + { + gpsmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + } + + /* set up port, event */ + gx.port = port; + gx.signature = signature; + if (!event_init()) + { + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_init failed"); + } + + if (0 != (e = apr_pool_create_alloc(&subpool, gx.pool))) + { + gpsmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + } + + /* 
qexec hash table */ + gx.qexectab = apr_hash_make(subpool); + CHECKMEM(gx.qexectab); + + /* qlog hash table */ + gx.qlogtab = apr_hash_make(subpool); + CHECKMEM(gx.qlogtab); + + /* segment hash table */ + gx.segmenttab = apr_hash_make(subpool); + CHECKMEM(gx.segmenttab); + + /* queryseg hash table */ + gx.querysegtab = apr_hash_make(subpool); + CHECKMEM(gx.querysegtab); + + /* pidtab */ + gx.pidtab = apr_hash_make(subpool); + CHECKMEM(gx.pidtab); + + /* device metrics hashes */ + net_devices = apr_hash_make(gx.pool); + CHECKMEM(net_devices); + disk_devices = apr_hash_make(gx.pool); + CHECKMEM(disk_devices); + +} + +static void setup_sigar(void) +{ + sigar_file_system_list_t sigar_fslist; + sigar_net_interface_list_t sigar_netlist; + int i, e, cnt; + int do_destroy = 0; + + /* initialize sigar */ + if (0 != (e = sigar_open(&gx.sigar))) + { + gpsmon_fatalx(FLINE, e, "sigar_open failed"); + } + + TR2(("sigar initialized\n")); + do_destroy = 1; + if (0 != sigar_net_interface_list_get(gx.sigar, &sigar_netlist)) + { + memset(&sigar_netlist, 0, sizeof(sigar_netlist)); + do_destroy = 0; + } + gx.netlist = apr_pcalloc(gx.pool, sizeof(const char*) * (1 + + sigar_netlist.number)); + CHECKMEM(gx.netlist); + for (i = 0; i < sigar_netlist.number; i++) + { + gx.netlist[i] = apr_pstrdup(gx.pool, sigar_netlist.data[i]); + CHECKMEM(gx.netlist[i]); + TR2(("sigar net %d: %s\n", i, gx.netlist[i])); + } + if (do_destroy) + sigar_net_interface_list_destroy(gx.sigar, &sigar_netlist); + + do_destroy = 1; + if (0 != sigar_file_system_list_get(gx.sigar, &sigar_fslist)) + { + memset(&sigar_fslist, 0, sizeof(sigar_fslist)); + do_destroy = 0; + } + cnt = 0; + TR2(("sigar fsnumber: %lu\n", sigar_fslist.number)); + for (i = 0; i < sigar_fslist.number; i++) + { + if (sigar_fslist.data[i].type == SIGAR_FSTYPE_LOCAL_DISK) + { + TR2(("sigar cnt: %d\n", cnt + 1)); + cnt++; + } + } + gx.fslist = apr_pcalloc(gx.pool, sizeof(const char*) * (cnt + 1)); + CHECKMEM(gx.fslist); + gx.devlist = 
apr_pcalloc(gx.pool, sizeof(const char*) * (cnt + 1)); + CHECKMEM(gx.devlist); + cnt = 0; + for (i = 0; i < sigar_fslist.number; i++) + { + if (sigar_fslist.data[i].type == SIGAR_FSTYPE_LOCAL_DISK) + { + gx.fslist[cnt] + = apr_pstrdup(gx.pool, sigar_fslist.data[i].dir_name); + CHECKMEM(gx.fslist[cnt]); + TR2(("fs: %s\n", gx.fslist[cnt])); + gx.devlist[cnt] = apr_pstrdup(gx.pool, + sigar_fslist.data[i].dev_name); + CHECKMEM(gx.devlist[cnt]); + cnt++; + } + } + + cnt = 0; + for (i = 0; i < sigar_fslist.number; i++) + { + if (sigar_fslist.data[i].type == SIGAR_FSTYPE_LOCAL_DISK || sigar_fslist.data[i].type == SIGAR_FSTYPE_NETWORK) + { + TR2(("sigar cnt: %d\n", cnt + 1)); + cnt++; + } + } + gx.allfslist = apr_pcalloc(gx.pool, sizeof(const char*) * (cnt + 1)); + CHECKMEM(gx.allfslist); + + cnt = 0; + for (i = 0; i < sigar_fslist.number; i++) + { + if (sigar_fslist.data[i].type == SIGAR_FSTYPE_LOCAL_DISK || sigar_fslist.data[i].type == SIGAR_FSTYPE_NETWORK) + { + gx.allfslist[cnt] + = apr_pstrdup(gx.pool, sigar_fslist.data[i].dir_name); + CHECKMEM(gx.allfslist[cnt]); + TR2(("allfs: %s\n", gx.allfslist[cnt])); + cnt++; + } + } + + if (do_destroy) + sigar_file_system_list_destroy(gx.sigar, &sigar_fslist); +} + +void gx_main(int port, apr_int64_t signature) +{ + /* set up our log files */ + if (opt.log_dir) + { + mkdir(opt.log_dir, S_IRWXU | S_IRWXG); + + if (0 != chdir(opt.log_dir)) + { + /* Invalid dir for log file, try home dir */ + char *home_dir = NULL; + if (0 == apr_env_get(&home_dir, "HOME", gx.pool)) + { + if (home_dir) + chdir(home_dir); + } + } + } + + update_log_filename(); + freopen(log_filename, "w", stdout); + setlinebuf(stdout); + + if (!get_and_allocate_hostname()) + gpsmon_fatalx(FLINE, 0, "failed to allocate memory for hostname"); + TR0(("HOSTNAME = '%s'\n", gx.hostname)); + + + + // first chance to write to log file + TR2(("signature = %" FMT64 "\n", signature)); + TR1(("detected %d cpu cores\n", number_cpu_cores)); + + setup_gx(port, signature); + 
setup_sigar(); + setup_udp(); + setup_tcp(); + + gx.tick = 0; + for (;;) + { + struct timeval tv; + apr_hash_index_t* hi; + + /* serve events every 2 second */ + gx.tick++; + gx.now = time(NULL); + tv.tv_sec = 2; + tv.tv_usec = 0; + + /* event dispatch blocks for a certain time based on the seconds given + * to event_loopexit */ + if (-1 == event_loopexit(&tv)) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), + "event_loopexit failed"); + } + + if (-1 == event_dispatch()) + { + gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_dispatch failed"); + } + + /* get pid metrics */ + for (hi = apr_hash_first(0, gx.qexectab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + gpmon_qexec_t* rec; + apr_hash_this(hi, 0, 0, &vptr); + rec = vptr; + get_pid_metrics(rec->key.hash_key.pid, + rec->key.tmid, + rec->key.ssid, + rec->key.ccnt); + } + + /* check log size */ + if (gx.tick % 60 == 0) + { + apr_finfo_t finfo; + if (0 == apr_stat(&finfo, log_filename, APR_FINFO_SIZE, gx.pool)) + { + if (opt.max_log_size != 0 && finfo.size > opt.max_log_size) + { + update_log_filename(); + freopen(log_filename, "w", stdout); + setlinebuf(stdout); + } + } + } + } +} + +static void usage(const char* msg) +{ + fprintf(stdout, "\nusage: %s [options] port\n\n", opt.pname); + fprintf(stdout, "options:\n"); + fprintf(stdout, "\t-?:\tprint this help screen\n"); + fprintf(stdout, "\t-v:\tverbose\n"); + fprintf(stdout, "\t-D:\trun in debug mode; don't run as daemon\n"); + fprintf(stdout, "\t-l:\tlog directory\n"); + fprintf(stdout, "\t-m:\tmax log size\n"); + fprintf(stdout, "\t-t:\tterminate timeout\n"), + fprintf(stdout, "\t-a:\titerator aggregate\n"); + fprintf(stdout, "\t-i:\tignore qexec packet\n"); + if (msg) + fprintf(stdout, "%s\n\n", msg); + + exit(msg ? 
1 : 0); +} + +static void parse_command_line(int argc, const char* const argv[]) +{ + apr_getopt_t* os; + int ch; + const char* arg; + const char* bin_start = NULL; + int e; + static apr_getopt_option_t option[] = + { + { NULL, '?', 0, "print help screen" }, + { NULL, 'v', 1, "verbose" }, + { NULL, 'D', 0, "debug mode" }, + { NULL, 'l', 1, "log directory" }, + { NULL, 'm', 1, "max log size" }, + { NULL, 't', 1, "terminate timeout" }, + { NULL, 'a', 0, "iterator aggregate" }, + { NULL, 'i', 0, "ignore qexec packet" }, + { NULL, 0, 0, NULL } }; + apr_pool_t* pool; + + if (0 != (e = apr_pool_create_alloc(&pool, 0))) + { + gpsmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); + } + + bin_start = argv[0] + strlen(argv[0]) - 1; + while (bin_start != argv[0] && *bin_start != '/') + bin_start--; + if (bin_start[0] == '/') + bin_start++; + + opt.pname = bin_start; + opt.v = opt.D = 0; + opt.max_log_size = 0; + opt.terminate_timeout = 0; + + if (0 != (e = apr_getopt_init(&os, pool, argc, argv))) + { + gpsmon_fatalx(FLINE, e, "apr_getopt_init failed"); + } + + while (0 == (e = apr_getopt_long(os, option, &ch, &arg))) + { + switch (ch) + { + case '?': + usage(0); + break; + case 'v': + opt.v = atoi(arg); + break; + case 'D': + opt.D = 1; + break; + case 'l': + opt.log_dir = strdup(arg); + break; + case 'm': + opt.max_log_size = apr_atoi64(arg); + break; + case 't': + opt.terminate_timeout = apr_atoi64(arg); + break; + } + } + + if (e != APR_EOF) + usage("Error: illegal arguments"); + + if (os->ind >= argc) + usage("Error: missing port argument"); + opt.arg_port = argv[os->ind++]; + + apr_pool_destroy(pool); + + verbose = opt.v; + very_verbose = opt.V; +} + +int main(int argc, const char* const argv[]) +{ + int port, e; + apr_int64_t signature; + + if (0 != (e = apr_initialize())) + { + gpsmon_fatalx(FLINE, e, "apr_initialize failed"); + } + + parse_command_line(argc, argv); + + port = atoi(opt.arg_port); + if (!(0 < port && port < (1 << 16))) + usage("Error: bad port 
number"); + + if (1 != fscanf(stdin, "%" FMT64, &signature)) + { + gpsmon_fatal(FLINE, "cannot read signature"); + } + + if (!opt.D) + { + if (0 != (e = apr_proc_detach(1))) + gpsmon_fatalx(FLINE, e, "apr_proc_detach failed"); + } + + + number_cpu_cores = (int)sysconf(_SC_NPROCESSORS_CONF); + + // sanity check this number a little + if (number_cpu_cores < 1) + { + number_cpu_cores = 1; + } + + cpu_cores_utilization_multiplier = 100.0 / (float)number_cpu_cores; + + gx_main(port, signature); + return 0; +} + diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h new file mode 100644 index 00000000000..72a1088f001 --- /dev/null +++ b/contrib/perfmon/src/include/gpmon.h @@ -0,0 +1,290 @@ +#ifndef GPMON_H +#define GPMON_H + +extern void gpmon_init(void); + +extern int64 gpmon_tick; +typedef struct gpmon_packet_t gpmon_packet_t; +typedef struct gpmon_qlogkey_t gpmon_qlogkey_t; +typedef struct gpmon_qlog_t gpmon_qlog_t; +typedef struct gpmon_qexec_t gpmon_qexec_t; +typedef struct gpmon_hello_t gpmon_hello_t; +typedef struct gpmon_metrics_t gpmon_metrics_t; +typedef struct gpmon_seginfo_t gpmon_seginfo_t; +typedef struct gpmon_fsinfokey_t gpmon_fsinfokey_t; +typedef struct gpmon_fsinfo_t gpmon_fsinfo_t; +typedef struct gpmon_query_seginfo_key_t gpmon_query_seginfo_key_t; +typedef struct gpmon_query_seginfo_t gpmon_query_seginfo_t; + +/* + * this dir sits in $MASTER_DATA_DIRECTORY. always include the + * suffix / + */ +#define GPMON_DIR "./gpperfmon/data/" +#define GPMON_LOG "./gpperfmon/logs/" +#define GPMON_ALERT_LOG_STAGE "alert_log_stage" +#define GPMON_DIR_MAX_PATH 100 +#define GPMON_DB "gpperfmon" + +#define GPMON_FSINFO_MAX_PATH 255 +#define GPMON_UNKNOWN "Unknown" + +/* global guc variables */ +extern int perfmon_port; +extern bool perfmon_enabled; +//extern bool perfmon_enable_query_metric; + +/* +this is enough space for 2 names plus a . 
between the names plus a null char at the end of the string +for example SCHEMA.RELATION\0 +*/ +#define SCAN_REL_NAME_BUF_SIZE (NAMEDATALEN*2) + + +/* ------------------------------------------------------------------ + INTERFACE + ------------------------------------------------------------------ */ + +extern void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket); +extern void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, + const char *queryText, + const char *appName, + const char *resqName, + const char *resqPriority); +extern void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket); +extern void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket); +extern void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket); +extern void gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket); +extern void gpmon_send(gpmon_packet_t*); +extern void gpmon_gettmid(int32*); + +/* ------------------------------------------------------------------ + FSINFO + ------------------------------------------------------------------ */ + +struct gpmon_fsinfokey_t +{ + char fsname [GPMON_FSINFO_MAX_PATH]; + char hostname[NAMEDATALEN]; +}; + +struct gpmon_fsinfo_t +{ + gpmon_fsinfokey_t key; + + int64 bytes_used; + int64 bytes_available; + int64 bytes_total; +}; + +/* ------------------------------------------------------------------ + METRICS + ------------------------------------------------------------------ */ +struct gpmon_metrics_t +{ + char hname[NAMEDATALEN]; + struct + { + uint64 total, used, actual_used, actual_free; + } mem; + + struct + { + uint64 total, used, page_in, page_out; + } swap; + + struct + { + float user_pct, sys_pct, idle_pct; + } cpu; + + struct + { + float value[3]; + } load_avg; + + struct + { + uint64 ro_rate, wo_rate, rb_rate, wb_rate; + } disk; + + struct + { + uint64 rp_rate, wp_rate, rb_rate, wb_rate; + } net; +}; + + +/* ------------------------------------------------------------------ + QLOG + 
------------------------------------------------------------------ */ + +struct gpmon_qlogkey_t { + int32 tmid; /* transaction time */ + int32 ssid; /* session id */ + int32 ccnt; /* command count */ +}; + +/* ------------------------------------------------------------------ + QUERY SEGINFO + ------------------------------------------------------------------ */ +struct gpmon_query_seginfo_key_t +{ + gpmon_qlogkey_t qkey; + int16 segid; /* segment id */ +}; + +struct gpmon_query_seginfo_t +{ + gpmon_query_seginfo_key_t key; + /* + * final rowsout for segid = -1 and sliceid = 1, otherwise -1 + * if not exist for this segment. + */ + int64 final_rowsout; + uint64 sum_cpu_elapsed; + uint64 sum_measures_rows_out; +}; + +/* process metrics ... filled in by gpsmon */ +typedef struct gpmon_proc_metrics_t gpmon_proc_metrics_t; +struct gpmon_proc_metrics_t { + uint32 fd_cnt; /* # opened files / sockets etc */ + float cpu_pct; /* cpu usage % */ + struct { + uint64 size, resident, share; + } mem; +}; + + +#define GPMON_QLOG_STATUS_INVALID -1 +#define GPMON_QLOG_STATUS_SILENT 0 +#define GPMON_QLOG_STATUS_SUBMIT 1 +#define GPMON_QLOG_STATUS_START 2 +#define GPMON_QLOG_STATUS_DONE 3 +#define GPMON_QLOG_STATUS_ERROR 4 +#define GPMON_QLOG_STATUS_CANCELING 5 + +#define GPMON_NUM_SEG_CPU 10 + +struct gpmon_qlog_t +{ + gpmon_qlogkey_t key; + char user[NAMEDATALEN]; + Oid dbid; + int32 tsubmit, tstart, tfin; + int32 status; /* GPMON_QLOG_STATUS_XXXXXX */ + int32 cost; + int64 cpu_elapsed; /* CPU elapsed for query */ + gpmon_proc_metrics_t p_metrics; + pid_t pid; +}; + + +/* ------------------------------------------------------------------ + QEXEC + ------------------------------------------------------------------ */ + +typedef struct gpmon_qexec_hash_key_t { + int16 segid; /* segment id */ + int32 pid; /* process id */ + int16 nid; /* plan node id */ +}gpmon_qexec_hash_key_t; + +/* XXX According to CK. 
+ * QE will NOT need to touch anything begin with _ + */ +typedef struct gpmon_qexeckey_t { + int32 tmid; /* transaction time */ + int32 ssid; /* session id */ + int32 ccnt; /* command count */ + gpmon_qexec_hash_key_t hash_key; +}gpmon_qexeckey_t; + +struct gpmon_qexec_t { + gpmon_qexeckey_t key; + char _hname[NAMEDATALEN]; + uint8 status; /* node status using PerfmonNodeStatus */ + uint64 _cpu_elapsed; /* CPU elapsed for iter */ + gpmon_proc_metrics_t _p_metrics; + uint64 rowsout; +}; + +/* + * Segment-related statistics + */ +struct gpmon_seginfo_t { + int32 dbid; // dbid as in gp_segment_configuration + char hostname[NAMEDATALEN]; // hostname without NIC extension + uint64 dynamic_memory_used; // allocated memory in bytes + uint64 dynamic_memory_available; // available memory in bytes, +}; + +/* ------------------------------------------------------------------ + HELLO + ------------------------------------------------------------------ */ + +struct gpmon_hello_t { + int64 signature; + int32 pid; /* pid of gpsmon */ +}; + + +/* + * This value is a constant that identifies gpperfmon packets in general. + */ +#define GPMON_MAGIC 0x78ab928d + +/* + * This version must match the most significant digit of the greenplum system version. 
+ */ +#define GPMON_PACKET_VERSION 1 +#define GPMMON_PACKET_VERSION_STRING "gpmmon packet version 1\n" + +enum gpmon_pkttype_t { + GPMON_PKTTYPE_NONE = 0, + GPMON_PKTTYPE_HELLO = 1, + GPMON_PKTTYPE_METRICS = 2, + GPMON_PKTTYPE_QLOG = 3, + GPMON_PKTTYPE_QEXEC = 4, + GPMON_PKTTYPE_SEGINFO = 5, + GPMON_PKTTYPE_QUERY_HOST_METRICS = 7, // query metrics update from a segment such as CPU per query + GPMON_PKTTYPE_FSINFO = 8, + GPMON_PKTTYPE_QUERYSEG = 9, + + GPMON_PKTTYPE_MAX +}; + + + +struct gpmon_packet_t { + /* if you modify this, do not forget to edit gpperfmon/src/gpmon/gpmonlib.c:gpmon_ntohpkt() */ + int32 magic; + int16 version; + int16 pkttype; + union { + gpmon_hello_t hello; + gpmon_metrics_t metrics; + gpmon_qlog_t qlog; + gpmon_qexec_t qexec; + gpmon_seginfo_t seginfo; + gpmon_fsinfo_t fsinfo; + } u; +}; + + +extern const char* gpmon_qlog_status_string(int gpmon_qlog_status); + +/* when adding a node type for perfmon display be sure to also update the corresponding structures in + in gpperfmon/src/gpmon/gpmonlib.c */ + + +typedef enum PerfmonNodeStatus +{ + PMNS_Initialize = 0, + PMNS_Executing, + PMNS_Finished + +} PerfmonNodeStatus; + +#endif diff --git a/contrib/perfmon/src/include/gpmonlib.h b/contrib/perfmon/src/include/gpmonlib.h new file mode 100644 index 00000000000..2cf9fab5d0c --- /dev/null +++ b/contrib/perfmon/src/include/gpmonlib.h @@ -0,0 +1,247 @@ +#ifndef GPMONLIB_H +#define GPMONLIB_H + +#undef GP_VERSION +#include "postgres_fe.h" + +#include "apr_general.h" +#include "apr_time.h" +#include "event.h" +#include "gpmon.h" + +#define BATCH 8 +#define DEFAULT_GPMMON_LOGDIR "gpperfmon/logs" + +#ifndef WIN32 +typedef int SOCKET; +#endif + +#define GPMON_DBUSER "gpmon" + +#define GPMON_PACKET_ERR_LOG_TIME 60 + +#define FMT64 APR_INT64_T_FMT +#define FMTU64 APR_UINT64_T_FMT +#define FLINE __FILE__ ":" APR_STRINGIFY(__LINE__) +#define CHECKMEM(x) if (x) ; else gpmon_fatal(FLINE, "out of memory") +#define ASSERT(x) if (x) ; else gpmon_fatal(FLINE, 
"Check condition:%s failed", #x) + +extern int verbose; +/* TODO: REMOVE */ +//extern int very_verbose; +#define TR0(x) gpmon_print x +#define TR1(x) if (verbose == 1) gpmon_print x +#define TR2(x) if (verbose == 2) gpmon_print x +#define TR1_FILE(x) if (verbose == 1) gpmon_print_file x + +/* Architecture specific limits for metrics */ +#if defined(osx104_x86) || defined(osx105_x86) + #define GPSMON_METRIC_MAX 0xffffffffUL +#elif defined(rhel7_x86_64) || defined(rhel6_x86_64) || defined(suse10_x86_64) + #define GPSMON_METRIC_MAX 0xffffffffffffffffULL +#else + #define GPSMON_METRIC_MAX 0xffffffffUL +#endif + +#define GPMON_DATE_BUF_SIZE 24 +extern Oid gpperfmon_dbid; + + +/* fatal & warning messages */ +extern int gpmon_print(const char* fmt, ...) pg_attribute_printf(1, 2); +extern int gpmon_fatal(const char* fline, const char* fmt, ...) pg_attribute_printf(2, 3); +extern int gpmon_fatalx(const char* fline, int e, const char* fmt, ...) pg_attribute_printf(3, 4); +extern int gpmon_warning(const char* fline, const char* fmt, ...) pg_attribute_printf(2, 3); +extern int gpmon_warningx(const char* fline, int e, const char* fmt, ...) pg_attribute_printf(3, 4); +extern void gpmon_print_file(const char* header_line, FILE* fp); + +// fatal messages for smon -- go to stdout only +extern int gpsmon_fatal(const char* fline, const char* fmt, ...) pg_attribute_printf(2,3); +extern int gpsmon_fatalx(const char* fline, int e, const char* fmt, ...) pg_attribute_printf(3, 4); + +/* convert packets to host order */ +extern apr_status_t gpmon_ntohpkt(apr_int32_t magic, apr_int16_t version, apr_int16_t pkttype); + +/* get the size of the union packet for smon_to_mon packets*/ +extern size_t get_size_by_pkttype_smon_to_mmon(apr_int16_t pkttype); + +/* strings */ +extern char* gpmon_trim(char* s); + +/* file manip */ +extern int gpmon_recursive_mkdir(char* work_dir); + +/* datetime, e.g. 
2004-02-14 23:50:02 */ +extern char* gpmon_datetime(time_t t, char str[GPMON_DATE_BUF_SIZE]); + +/* version that rounds to lowest 5 sec interval */ +extern char* gpmon_datetime_rounded(time_t t, char str[GPMON_DATE_BUF_SIZE]); + +/* utility */ +extern apr_int32_t get_query_status(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt); +extern char *get_query_text(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_pool_t *pool); + +#define DEFAULT_PATH_TO_HADOOP_HOST_FILE "/etc/gphd/gphdmgr/conf/clusterinfo.txt" +#define PATH_TO_HADOOP_SMON_LOGS "/var/log/gphd/smon" + +#define MIN_MESSAGES_PER_INTERVAL (1) +#define MAX_MESSAGES_PER_INTERVAL (50) +#define MINIMUM_MESSAGE_INTERVAL (1) // in minutes +#define MAXIMUM_MESSAGE_INTERVAL (10080) //one week + +#define MAX_QUERY_COMPARE_LENGTH (1024 * 1024 * 10) +/* +* The below is a simple divide macro that does division rounding up if you get .5 or greater or down if you get below .5 without +* doing any floating point math. It can be used instead of the round math.h functions to avoid floating point math and crazy casting. +* This is an example of how it works: 9/5 = 1.8 so you want the answer to be 2 +* the macro does the following: It first does the basic division which always rounds down: (9/5) = 1. Then based on the remainder it +* figures out whether or not to add 1. Mod is used to calculate the remainder (9 mod 5) = 4. Then it needs to calculate whether +* 4/5 is greater or equal to .5 : it does this by calculating whether or not the remainder is greater than or equal to half the value of +* 5. This is done by dividing 5/2 and rounding appropriately (this is done by adding the remainder (5 mod 2) which is either 1 or 0). 5/2 +* rounded is 3 which is less than the remainder 4, so it rounds up and adds 1 getting 2 as the answer. 
+* example 2, for 9/7 = 1.29 so you want the answer to be 1 +* step 1: (((9)/(7)) + ((((9)%(7))>=((((7)/2))+((7)%2)))?1:0)) +* step 2: 1 + ((2>=(3+1))?1:0)) step 3: 1 + ((2>=(4)?1:0)) step 4: 1 + 0 step 5: 1 +* +* NOTE: THIS IS WELL TESTED; DO NOT CHANGE!!! +*/ +#define ROUND_DIVIDE(numerator, denominator) (((numerator)/(denominator)) + ((((numerator)%(denominator))>=((((denominator)/2))+((denominator)%2)))?1:0)) + + + + +/* gpmmon options */ +typedef struct mmon_options_t +{ + int argc; + const char* const *argv; + const char* pname; + char* gpdb_port; + char* conf_file; + char* log_dir; + char* smon_log_dir; + char* smon_hadoop_swonly_clusterfile; + char* smon_hadoop_swonly_logdir; + char* smon_hadoop_swonly_binfile; + char* smdw_aliases; + apr_uint64_t max_log_size; + int max_fd; /* this is the max fd value we ever seen */ + int v; + int quantum; + int min_query_time; + int qamode; + int harvest_interval; + apr_uint64_t tail_buffer_max; + int console; + int warning_disk_space_percentage; + int error_disk_space_percentage; + time_t disk_space_interval; // interval in seconds + unsigned int max_disk_space_messages_per_interval; + int partition_age; // in month +} mmon_options_t; + +typedef struct addressinfo_holder_t addressinfo_holder_t; +struct addressinfo_holder_t +{ + char* address; // an alternate host name to access this host + char* ipstr; // the ipstring associated with this address + bool ipv6; + struct addressinfo_holder_t* next; +}; + +typedef struct multi_interface_holder_t multi_interface_holder_t; +struct multi_interface_holder_t +{ + addressinfo_holder_t* current; + unsigned int counter; +}; + +#define CONM_INTERVAL (16) +#define CONM_LOOP_LAUNCH_FRAME (1) +#define CONM_LOOP_BROKEN_FRAME (9) +#define CONM_LOOP_HANG_FRAME (12) + +#define GPSMON_TIMEOUT_NONE (0) +#define GPSMON_TIMEOUT_RESTART (1) +#define GPSMON_TIMEOUT_DETECTED (2) + +/* segment host */ +typedef struct host_t +{ + apr_thread_mutex_t *mutex; + int sock; /* socket connected to gpsmon 
on this host */ + int eflag; /* flag: socket has error */ + struct event* event; /* points to _event if set */ + struct event _event; + + char* hostname; + + addressinfo_holder_t* addressinfo_head; + addressinfo_holder_t* addressinfo_tail; + + // there are 2 of these so we don't need to mutex + multi_interface_holder_t connection_hostname; + + apr_uint32_t address_count; + char* smon_bin_location; + char* data_dir; + unsigned char is_master; /* 1 if host is the same host where the master runs */ + unsigned char is_hdm; + unsigned char is_hdw; + unsigned char is_hbw; + unsigned char is_hdc; + unsigned char is_etl; + char ever_connected; /* set to non-zero after first connection attempt */ + char connect_timeout; + apr_int32_t pid; +} host_t; + +#define QEXEC_MAX_ROW_BUF_SIZE (2048) + +typedef struct qexec_packet_data_t +{ + gpmon_qexeckey_t key; + apr_uint64_t rowsout; + apr_uint64_t _cpu_elapsed; /* CPU elapsed for iter */ + apr_uint64_t measures_rows_in; +} qexec_packet_data_t; + +typedef struct qexec_packet_t +{ + qexec_packet_data_t data; +} qexec_packet_t; + +typedef struct gp_smon_to_mmon_header_t { + /* if you modify this, do not forget to edit gpperfmon/src/gpmon/gpmonlib.c:gpmon_ntohpkt() */ + apr_int16_t pkttype; + apr_int32_t magic; + apr_int16_t version; +} gp_smon_to_mmon_header_t; + +typedef struct gp_smon_to_mmon_packet_t { + gp_smon_to_mmon_header_t header; + union { + gpmon_hello_t hello; + gpmon_metrics_t metrics; + gpmon_qlog_t qlog; + qexec_packet_t qexec_packet; + gpmon_seginfo_t seginfo; + gpmon_fsinfo_t fsinfo; + gpmon_query_seginfo_t queryseg; + } u; +} gp_smon_to_mmon_packet_t; + +char* get_connection_hostname(host_t* host); +char* get_connection_ip(host_t* host); +bool get_connection_ipv6_status(host_t* host); +void advance_connection_hostname(host_t* host); + +double subtractTimeOfDay(struct timeval* begin, struct timeval* end); + +/* Set header*/ +extern void gp_smon_to_mmon_set_header(gp_smon_to_mmon_packet_t* pkt, apr_int16_t pkttype); 
+ +apr_status_t apr_pool_create_alloc(apr_pool_t ** newpool, apr_pool_t *parent); +void gpdb_get_single_string_from_query(const char* QUERY, char** resultstring, apr_pool_t* pool); +void merge_qlog(gpmon_qlog_t* qlog, const gpmon_qlog_t* newqlog); +#endif /* GPMONLIB_H */ diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8a195a39856..b72aa2b89f6 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -230,6 +230,7 @@ enable_debug_extensions = @enable_debug_extensions@ enable_orafce = @enable_orafce@ enable_mapreduce = @enable_mapreduce@ enable_shared_postgres_backend = @enable_shared_postgres_backend@ +enable_perfmon = @enable_perfmon@ enable_gpcloud = @enable_gpcloud@ enable_ic_proxy = @enable_ic_proxy@ enable_pax = @enable_pax@ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 379a94ae56a..b9c58b62937 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -1067,6 +1067,9 @@ /* Define to 1 to build with pax support. (--enable-pax) */ #undef USE_PAX_STORAGE +/* Define to 1 to build with perfmon. (--enable-perfmon) */ +#undef USE_PERFMON + /* Define to 1 to build with PAM support. 
(--with-pam) */ #undef USE_PAM diff --git a/src/include/utils/process_shared_preload_libraries.h b/src/include/utils/process_shared_preload_libraries.h index fe098ccf401..98cea2b6d21 100644 --- a/src/include/utils/process_shared_preload_libraries.h +++ b/src/include/utils/process_shared_preload_libraries.h @@ -4,3 +4,6 @@ #ifdef USE_PAX_STORAGE "pax", #endif +#ifdef USE_PERFMON + "gpmmon","gpmon" +#endif From 5e0781a3ed8606396a14b509e75bd0659dd553e9 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Wed, 29 May 2024 16:24:03 +0800 Subject: [PATCH 02/40] Update gpperfmon.conf and gpperfmon_install * set quantum in gpperfmon.conf to 5s * enable gpmon user to connect databases remotely in gpperfmon_install --- contrib/perfmon/Makefile | 6 +----- contrib/perfmon/gpperfmon.conf | 4 ++-- contrib/perfmon/gpperfmon_install | 8 ++++---- contrib/perfmon/src/include/gpmon.h | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index 196ec42acf3..55c13bf2607 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,10 +1,7 @@ NAME = gpperfmon top_builddir = ../../ -REGRESS = guc_config query -PG_CONFIG ?= pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) + ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) @@ -39,4 +36,3 @@ install: installdirs mkdir -p '$(DESTDIR)$(libdir)/$(NAME)' $(INSTALL_SCRIPT) $(NAME).sql '$(DESTDIR)$(libdir)/$(NAME)' $(INSTALL_SCRIPT) $(NAME).conf '$(DESTDIR)$(libdir)/$(NAME)' - diff --git a/contrib/perfmon/gpperfmon.conf b/contrib/perfmon/gpperfmon.conf index e8bc342001e..f2d5616c00a 100644 --- a/contrib/perfmon/gpperfmon.conf +++ b/contrib/perfmon/gpperfmon.conf @@ -2,7 +2,7 @@ # quantum specifies the time in seconds between updates from # performance monitor agents on all segments. 
Valid values # are 5, 10, 15, 20, 30, or 60 -quantum = 15 +quantum = 5 # min_query_time specifies the minimum query run time # in seconds for statistics collection. The monitor logs all @@ -27,7 +27,7 @@ min_query_time = 20 # specified, no errors are sent. #error_disk_space_percentage = 90 -gThis is the interval in minutes that limits the number of +#This is the interval in minutes that limits the number of #error/warning messages that are sent. The minimum value for #this configuration is 1. Setting this to 0 or not specifying #this configuration results in it getting set to the minimum. diff --git a/contrib/perfmon/gpperfmon_install b/contrib/perfmon/gpperfmon_install index 27b8c935d93..a4ffb416369 100755 --- a/contrib/perfmon/gpperfmon_install +++ b/contrib/perfmon/gpperfmon_install @@ -1,7 +1,7 @@ #!/usr/bin/env python3 ''' -USAGE: gpperfmon_install --port GPDB_PORT [--enable --password GPMON_PASSWORD] [--pgpass PATH_TO_FILE] [--gpperfmonport GPPERFMON_PORT] [--verbose] +USAGE: gpperfmon_install --port CBDB_PORT [--enable --password GPMON_PASSWORD] [--pgpass PATH_TO_FILE] [--gpperfmonport GPPERFMON_PORT] [--verbose] where this script will install the gpperfmon database and schema @@ -14,7 +14,7 @@ USAGE: gpperfmon_install --port GPDB_PORT [--enable --password GPMON_PASSWORD] when using --enable, --password must be specified --password will set the password for gpmon superuser - --port is the port used by gpperfmon to connect to GPDB + --port is the port used by gpperfmon to connect to CBDB --pgpass is an option to allow overriding default path of $HOME/.pgpass --gpperfmonport sets the guc 'gpperfmon_port' for gpperfmon communication (default is 8888) --verbose will show output from sub-commands @@ -181,10 +181,10 @@ if __name__ == '__main__': cmd = Command("""echo "local gpperfmon gpmon md5" >> %s""" % pg_hba, showOutput=True) commands.append(cmd) - cmd = Command("""echo "host all gpmon 127.0.0.1/28 md5" >> %s""" % pg_hba, showOutput=True) + cmd = 
Command("""echo "host all gpmon 0.0.0.0/0 md5" >> %s""" % pg_hba, showOutput=True) commands.append(cmd) - cmd = Command("""echo "host all gpmon ::1/128 md5" >> %s""" % pg_hba, showOutput=True) + cmd = Command("""echo "host all gpmon ::0/0 md5" >> %s""" % pg_hba, showOutput=True) commands.append(cmd) ################################################ diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index 72a1088f001..7f3d49e539e 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -236,7 +236,7 @@ struct gpmon_hello_t { #define GPMON_MAGIC 0x78ab928d /* - * This version must match the most significant digit of the greenplum system version. + * This version must match the most significant digit of the cloudberrydb system version. */ #define GPMON_PACKET_VERSION 1 #define GPMMON_PACKET_VERSION_STRING "gpmmon packet version 1\n" From b0351f519acfeb66f604ed59df6c87ca885aaf50 Mon Sep 17 00:00:00 2001 From: Xiaoran Wang Date: Thu, 30 May 2024 16:57:15 +0800 Subject: [PATCH 03/40] Fix gpperfmon Allow gpmon to access db from local host either 127.0.0.1 or the host's public ip --- contrib/perfmon/gpperfmon_install | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/contrib/perfmon/gpperfmon_install b/contrib/perfmon/gpperfmon_install index a4ffb416369..fab55f5f13e 100755 --- a/contrib/perfmon/gpperfmon_install +++ b/contrib/perfmon/gpperfmon_install @@ -22,6 +22,7 @@ USAGE: gpperfmon_install --port CBDB_PORT [--enable --password GPMON_PASSWORD] import os, sys, time, re from subprocess import Popen +from subprocess import PIPE try: from optparse import Option, OptionParser @@ -181,12 +182,21 @@ if __name__ == '__main__': cmd = Command("""echo "local gpperfmon gpmon md5" >> %s""" % pg_hba, showOutput=True) commands.append(cmd) - cmd = Command("""echo "host all gpmon 0.0.0.0/0 md5" >> %s""" % pg_hba, showOutput=True) + cmd = Command("""echo "host all gpmon 127.0.0.1/28 
md5" >> %s""" % pg_hba, showOutput=True) commands.append(cmd) - cmd = Command("""echo "host all gpmon ::0/0 md5" >> %s""" % pg_hba, showOutput=True) + cmd = Command("""echo "host all gpmon ::1/128 md5" >> %s""" % pg_hba, showOutput=True) commands.append(cmd) + # get ip of the host + process = Popen("ifconfig | awk '$1==\"inet\"{print $2}'", shell=True, executable="/bin/bash", stdout=PIPE) + out = process.stdout.readlines() + for ip in out: + ip = ip.strip().decode('ascii') + if ip != "127.0.0.1": + cmd = Command("""echo "host all gpmon %s/32 md5" >> %s""" % (ip, pg_hba), showOutput=True) + commands.append(cmd) + ################################################ # these commands add a new line to the top of .pgpass and save a copy of old .pgpass cmd = Command("""touch %s""" % (pg_pass)) @@ -195,7 +205,7 @@ if __name__ == '__main__': cmd = Command("""mv -f %s %s""" % (pg_pass, old_pg_pass)) commands.append(cmd) - cmd = Command("""echo "*:%d:gpperfmon:gpmon:%s" >> %s""" % (options.port, options.password, pg_pass), showOutput=True) + cmd = Command("""echo "*:%d:*:gpmon:%s" >> %s""" % (options.port, options.password, pg_pass), showOutput=True) commands.append(cmd) cmd = Command("""cat %s >> %s""" % (old_pg_pass, pg_pass), showOutput=True) From bc5664406af195e7457ebe9607bc9a9020a6030e Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Thu, 13 Jun 2024 15:50:01 +0800 Subject: [PATCH 04/40] perfmon supports python3 and add regress test Replace python with python3 in gpperfmon.sql Add regress test for perfmon, but due to "cluster validation failed" after restarting the cluster, just disable the test and will fix it later. 
--- contrib/perfmon/Makefile | 13 +++- contrib/perfmon/expected/post_run.out | 39 ++++++++++++ contrib/perfmon/expected/pre_run_check.out | 51 +++++++++++++++ contrib/perfmon/expected/query.out | 72 ++++++++++++++++++++++ contrib/perfmon/gpmon_catqrynow.py | 2 +- contrib/perfmon/gpperfmon.sql | 4 +- contrib/perfmon/sql/post_run.sql | 9 +++ contrib/perfmon/sql/pre_run_check.sql | 18 ++++++ contrib/perfmon/sql/query.sql | 18 ++++-- 9 files changed, 215 insertions(+), 11 deletions(-) create mode 100644 contrib/perfmon/expected/post_run.out create mode 100644 contrib/perfmon/expected/pre_run_check.out create mode 100644 contrib/perfmon/sql/post_run.sql create mode 100644 contrib/perfmon/sql/pre_run_check.sql diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index 55c13bf2607..ecc6a0283db 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,29 +1,34 @@ NAME = gpperfmon -top_builddir = ../../ +REGRESS = pre_run_check guc_config query post_run ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else +top_builddir = ../../ subdir = contrib/perfmon include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk endif -clean distclean: +.PHONY: clean_perfmon +clean_perfmon: $(MAKE) -C src/gpmon clean $(MAKE) -C src/gpmmon clean $(MAKE) -C src/gpsmon clean rm -rf gpmon.so rm -rf gpsmon rm -rf gpperfmon.so +clean distclean: clean_perfmon all: $(MAKE) -C src/gpmon all $(MAKE) -C src/gpmmon all $(MAKE) -C src/gpsmon all +.PHONY: installdirs installdirs: $(MKDIR_P) '$(DESTDIR)$(bindir)/../sbin' install: installdirs @@ -36,3 +41,7 @@ install: installdirs mkdir -p '$(DESTDIR)$(libdir)/$(NAME)' $(INSTALL_SCRIPT) $(NAME).sql '$(DESTDIR)$(libdir)/$(NAME)' $(INSTALL_SCRIPT) $(NAME).conf '$(DESTDIR)$(libdir)/$(NAME)' + +.PHONY: icw_test + icw_test: + make -C $(REGRESS_DIR) icw_test diff --git a/contrib/perfmon/expected/post_run.out b/contrib/perfmon/expected/post_run.out new file mode 
100644 index 00000000000..07ffcc4e550 --- /dev/null +++ b/contrib/perfmon/expected/post_run.out @@ -0,0 +1,39 @@ +-- start_ignore +\! gpconfig -c perfmon.enable -v 'off' +20240612:11:02:26:001401 gpconfig:xx:gpadmin-[INFO]:-completed successfully with parameters '-c perfmon.enable -v off' +\! gpstop -ari +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Starting gpstop with args: -ari +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Gathering information and validating the environment... +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Obtaining CloudberryDB Coordinator catalog information +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Obtaining Segment details from coordinator... +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-CloudberryDB Version: 'postgres (Cloudberry Database) 1.5.3+dev.67.g19a252cacb build dev' +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Commencing Coordinator instance shutdown with mode='immediate' +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Coordinator segment instance directory=/home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Attempting forceful termination of any leftover coordinator process +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Terminating processes for segment /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1 +20240612:11:21:12:026195 gpstop:xx:gpadmin-[INFO]:-Stopping coordinator standby host xx mode=immediate +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Successfully shutdown standby process on xx +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Targeting dbid [2, 5, 3, 6, 4, 7] for shutdown +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Commencing parallel primary segment instance shutdown, please wait... 
+20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-0.00% of jobs completed +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-100.00% of jobs completed +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Commencing parallel mirror segment instance shutdown, please wait... +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-0.00% of jobs completed +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-100.00% of jobs completed +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:----------------------------------------------------- +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:- Segments stopped successfully = 6 +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:- Segments with errors during stop = 0 +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:----------------------------------------------------- +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Successfully shutdown 6 of 6 segment instances +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Database successfully shutdown with no errors reported +20240612:11:21:13:026195 gpstop:xx:gpadmin-[INFO]:-Restarting System... +-- end_ignore +\! gpconfig -s perfmon.enable +Values on all segments are consistent +GUC : perfmon.enable +Coordinator value: off +Segment value: off +-- start_ignore +\c contrib_regression +drop database if exists gpperfmon; +-- end_ignore diff --git a/contrib/perfmon/expected/pre_run_check.out b/contrib/perfmon/expected/pre_run_check.out new file mode 100644 index 00000000000..e89807fb148 --- /dev/null +++ b/contrib/perfmon/expected/pre_run_check.out @@ -0,0 +1,51 @@ +-- start_ignore +drop database if exists gpperfmon; +\! 
gpperfmon_install --enable --port $PGPORT --password 123 +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-createdb gpperfmon >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 psql -f /home/gpadmin/install/gpdb/lib/gpperfmon/gpperfmon.sql gpperfmon >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 psql template1 -c "DROP ROLE IF EXISTS gpmon" >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 psql template1 -c "CREATE ROLE gpmon WITH SUPERUSER CREATEDB LOGIN ENCRYPTED PASSWORD '********'" >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-echo "local gpperfmon gpmon md5" >> /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/pg_hba.conf +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-echo "host all gpmon 127.0.0.1/28 md5" >> /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/pg_hba.conf +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-echo "host all gpmon ::1/128 md5" >> /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/pg_hba.conf +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-echo "host all gpmon 172.17.0.1/32 md5" >> /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/pg_hba.conf +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-echo "host all gpmon 192.168.176.231/32 md5" >> /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/pg_hba.conf +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-touch /home/gpadmin/.pgpass >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-mv -f /home/gpadmin/.pgpass /home/gpadmin/.pgpass.1717553812 >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-echo "*:7000:*:gpmon:123" >> /home/gpadmin/.pgpass 
+20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-cat /home/gpadmin/.pgpass.1717553812 >> /home/gpadmin/.pgpass +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-chmod 0600 /home/gpadmin/.pgpass >& /dev/null +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-cp /home/gpadmin/install/gpdb/lib/gpperfmon/gpperfmon.conf /home/gpadmin/workspace/hashdata-lightning/gpAux/gpdemo/datadirs/qddir/demoDataDir-1/gpperfmon/conf +20240605:10:16:52:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 gpconfig -c perfmon.enable -v on >& /dev/null +20240605:10:16:53:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 gpconfig -c perfmon.port -v 8888 >& /dev/null +20240605:10:16:54:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 gpconfig -c gp_external_enable_exec -v on --masteronly >& /dev/null +20240605:10:16:54:017713 gpperfmon_install:xx:gpadmin-[INFO]:-gpperfmon will be enabled after a full restart of cloudberrydb +-- end_ignore +-- check cluster state +\c postgres +select pg_sleep(10); + pg_sleep +---------- + +(1 row) + +SELECT sync_state FROM pg_stat_get_wal_senders(); + sync_state +------------ + sync +(1 row) + +\c contrib_regression +select + case + when setting = 'on' then 'perfmon is running' + else + 'perfmon is not running' + end +from pg_settings +where name='perfmon.enable'; + case +-------------------- + perfmon is running +(1 row) + diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index e69de29bb2d..f4a3e920179 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -0,0 +1,72 @@ +-- start_ignore +select sess_id from pg_stat_activity where pg_backend_pid()=pid; + sess_id +--------- + 26 +(1 row) + +\gset +select pg_sleep(30); + pg_sleep +---------- + +(1 row) + +-- end_ignore +\c gpperfmon +select pg_sleep(100); + pg_sleep +---------- + +(1 row) + +select count(*) from system_now; + count +------- + 1 +(1 row) + +select count(*) from 
database_now; + count +------- + 1 +(1 row) + +select count(*) from diskspace_now; + count +------- + 1 +(1 row) + +select count(*) > 0 from system_history; +NOTICE: One or more columns in the following table(s) do not have statistics: system_history +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. + ?column? +---------- + t +(1 row) + +select count(*) > 0 from database_history; +NOTICE: One or more columns in the following table(s) do not have statistics: database_history +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. + ?column? +---------- + t +(1 row) + +select count(*) > 0 from diskspace_history; +NOTICE: One or more columns in the following table(s) do not have statistics: diskspace_history +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. + ?column? +---------- + t +(1 row) + +select status, query_text from queries_history where ssid = :sess_id; +NOTICE: One or more columns in the following table(s) do not have statistics: queries_history +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. 
+ status | query_text +--------+---------------------- + done | select pg_sleep(30); +(1 row) + diff --git a/contrib/perfmon/gpmon_catqrynow.py b/contrib/perfmon/gpmon_catqrynow.py index 676889b8e79..0b7932fb0a8 100644 --- a/contrib/perfmon/gpmon_catqrynow.py +++ b/contrib/perfmon/gpmon_catqrynow.py @@ -45,4 +45,4 @@ line[-3] = '"' + appname + '"' line[-2] = '"' + rsqname + '"' line[-1] = '"' + priority + '"' - print '|'.join(line).strip() + print('|'.join(line).strip()) diff --git a/contrib/perfmon/gpperfmon.sql b/contrib/perfmon/gpperfmon.sql index 81f93a7a1c6..b234da121b4 100644 --- a/contrib/perfmon/gpperfmon.sql +++ b/contrib/perfmon/gpperfmon.sql @@ -87,7 +87,7 @@ partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVE create external web table public.queries_now ( like public.queries_history -) execute 'python $GPHOME/sbin/gpmon_catqrynow.py 2> /dev/null || true' on master format 'csv' (delimiter '|' NULL as 'null'); +) execute 'python3 $GPHOME/sbin/gpmon_catqrynow.py 2> /dev/null || true' on master format 'csv' (delimiter '|' NULL as 'null'); create external web table public.queries_now_fast ( ctime timestamptz(0), @@ -149,7 +149,7 @@ create external web table public._database_tail ( create external web table public.master_data_dir (hostname text, dir text) -execute E'python -c "import socket, os; print socket.gethostname() + \\"|\\" + os.getcwd()"' on master +execute E'python3 -c "import socket, os; print(socket.gethostname(), \\"|\\", os.getcwd())"' on master format 'csv' (delimiter '|'); diff --git a/contrib/perfmon/sql/post_run.sql b/contrib/perfmon/sql/post_run.sql new file mode 100644 index 00000000000..f02327ff7c2 --- /dev/null +++ b/contrib/perfmon/sql/post_run.sql @@ -0,0 +1,9 @@ +-- start_ignore +\! gpconfig -c perfmon.enable -v 'off' +\! gpstop -ari +-- end_ignore +\! 
gpconfig -s perfmon.enable +-- start_ignore +\c contrib_regression +drop database if exists gpperfmon; +-- end_ignore diff --git a/contrib/perfmon/sql/pre_run_check.sql b/contrib/perfmon/sql/pre_run_check.sql new file mode 100644 index 00000000000..103b391716e --- /dev/null +++ b/contrib/perfmon/sql/pre_run_check.sql @@ -0,0 +1,18 @@ +-- start_ignore +drop database if exists gpperfmon; +\! gpperfmon_install --enable --port $PGPORT --password 123 +\! gpstop -ari +-- end_ignore +-- check cluster state +\c postgres +select pg_sleep(10); +SELECT sync_state FROM pg_stat_get_wal_senders(); +\c contrib_regression +select + case + when setting = 'on' then 'perfmon is running' + else + 'perfmon is not running' + end +from pg_settings +where name='perfmon.enable'; diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index 537a56fd01d..dce40c40c5b 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -1,9 +1,15 @@ +-- start_ignore select sess_id from pg_stat_activity where pg_backend_pid()=pid; \gset -create table test(a int); -select * from test; -select pg_sleep(18); +select pg_sleep(30); +-- end_ignore + \c gpperfmon -select ssid, pid, ccnt, status, query_text from queries_now where ssid = :sess_id; -\c contrib_regression -drop table test; +select pg_sleep(100); +select count(*) from system_now; +select count(*) from database_now; +select count(*) from diskspace_now; +select count(*) > 0 from system_history; +select count(*) > 0 from database_history; +select count(*) > 0 from diskspace_history; +select status, query_text from queries_history where ssid = :sess_id; From 65b6caff859e9e3c264491a520bd37610a0d3500 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Wed, 19 Jun 2024 10:43:56 +0800 Subject: [PATCH 05/40] Fix regress test Cluster validation failed: standby replication state and enable perfmon test Standby crashes after running test:local_directory_table_mixed and restart. 
standby failed to replay the create tablespace log as the tablespace directory has been removed in the test. --- GNUmakefile.in | 3 +++ contrib/perfmon/Makefile | 4 ---- contrib/perfmon/expected/pre_run_check.out | 14 -------------- contrib/perfmon/expected/query.out | 12 ++++-------- contrib/perfmon/sql/pre_run_check.sql | 4 ---- contrib/perfmon/sql/query.sql | 4 ++++ .../input/local_directory_table_mixed.source | 2 +- 7 files changed, 12 insertions(+), 31 deletions(-) diff --git a/GNUmakefile.in b/GNUmakefile.in index c7c86cfed14..2cb17833f83 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -200,6 +200,9 @@ ifeq ($(with_openssl), yes) ICW_TARGETS += contrib/sslinfo endif ICW_TARGETS += gpcontrib gpMgmt/bin +ifeq ($(enable_perfmon), yes) +ICW_TARGETS += contrib/perfmon +endif $(call recurse,installcheck-world,src/test src/pl src/interfaces/ecpg $(ICW_TARGETS) src/bin,installcheck) $(call recurse,install-tests,src/test/regress,install-tests) diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index ecc6a0283db..dfb495ec41b 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -41,7 +41,3 @@ install: installdirs mkdir -p '$(DESTDIR)$(libdir)/$(NAME)' $(INSTALL_SCRIPT) $(NAME).sql '$(DESTDIR)$(libdir)/$(NAME)' $(INSTALL_SCRIPT) $(NAME).conf '$(DESTDIR)$(libdir)/$(NAME)' - -.PHONY: icw_test - icw_test: - make -C $(REGRESS_DIR) icw_test diff --git a/contrib/perfmon/expected/pre_run_check.out b/contrib/perfmon/expected/pre_run_check.out index e89807fb148..1306287dd96 100644 --- a/contrib/perfmon/expected/pre_run_check.out +++ b/contrib/perfmon/expected/pre_run_check.out @@ -21,20 +21,6 @@ drop database if exists gpperfmon; 20240605:10:16:54:017713 gpperfmon_install:xx:gpadmin-[INFO]:-PGPORT=7000 gpconfig -c gp_external_enable_exec -v on --masteronly >& /dev/null 20240605:10:16:54:017713 gpperfmon_install:xx:gpadmin-[INFO]:-gpperfmon will be enabled after a full restart of cloudberrydb -- end_ignore --- check cluster state -\c postgres
-select pg_sleep(10); - pg_sleep ----------- - -(1 row) - -SELECT sync_state FROM pg_stat_get_wal_senders(); - sync_state ------------- - sync -(1 row) - \c contrib_regression select case diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index f4a3e920179..4069e342418 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -20,6 +20,10 @@ select pg_sleep(100); (1 row) +analyze system_history; +analyze database_history; +analyze diskspace_history; +analyze queries_history; select count(*) from system_now; count ------- @@ -39,32 +43,24 @@ select count(*) from diskspace_now; (1 row) select count(*) > 0 from system_history; -NOTICE: One or more columns in the following table(s) do not have statistics: system_history -HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. ?column? ---------- t (1 row) select count(*) > 0 from database_history; -NOTICE: One or more columns in the following table(s) do not have statistics: database_history -HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. ?column? ---------- t (1 row) select count(*) > 0 from diskspace_history; -NOTICE: One or more columns in the following table(s) do not have statistics: diskspace_history -HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. ?column? ---------- t (1 row) select status, query_text from queries_history where ssid = :sess_id; -NOTICE: One or more columns in the following table(s) do not have statistics: queries_history -HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. 
status | query_text --------+---------------------- done | select pg_sleep(30); diff --git a/contrib/perfmon/sql/pre_run_check.sql b/contrib/perfmon/sql/pre_run_check.sql index 103b391716e..81465ab6f1f 100644 --- a/contrib/perfmon/sql/pre_run_check.sql +++ b/contrib/perfmon/sql/pre_run_check.sql @@ -3,10 +3,6 @@ drop database if exists gpperfmon; \! gpperfmon_install --enable --port $PGPORT --password 123 \! gpstop -ari -- end_ignore --- check cluster state -\c postgres -select pg_sleep(10); -SELECT sync_state FROM pg_stat_get_wal_senders(); \c contrib_regression select case diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index dce40c40c5b..718827320d1 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -6,6 +6,10 @@ select pg_sleep(30); \c gpperfmon select pg_sleep(100); +analyze system_history; +analyze database_history; +analyze diskspace_history; +analyze queries_history; select count(*) from system_now; select count(*) from database_now; select count(*) from diskspace_now; diff --git a/src/test/isolation2/input/local_directory_table_mixed.source b/src/test/isolation2/input/local_directory_table_mixed.source index 84afcd8fbc0..847e35f943b 100644 --- a/src/test/isolation2/input/local_directory_table_mixed.source +++ b/src/test/isolation2/input/local_directory_table_mixed.source @@ -132,4 +132,4 @@ DROP TABLESPACE directory_tblspc; DROP DIRECTORY TABLE dir_table1 WITH CONTENT; DROP DIRECTORY TABLE dir_table2 WITH CONTENT; DROP DIRECTORY TABLE dir_table3 WITH CONTENT; -DROP TABLESPACE directory_tblspc; \ No newline at end of file +DROP TABLESPACE directory_tblspc; From 9bd5af7511616631a8f1b68fad39ec00e553a389 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 21 Jun 2024 10:05:19 +0800 Subject: [PATCH 06/40] Fix regress test When there is more than one mount point in the system, the test failed. Just fix that.
--- contrib/perfmon/expected/guc_config.out | 6 ------ contrib/perfmon/expected/query.out | 24 ++++++++++++------------ contrib/perfmon/sql/guc_config.sql | 1 - contrib/perfmon/sql/query.sql | 6 +++--- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/contrib/perfmon/expected/guc_config.out b/contrib/perfmon/expected/guc_config.out index 11e798c6420..6c1056f5865 100644 --- a/contrib/perfmon/expected/guc_config.out +++ b/contrib/perfmon/expected/guc_config.out @@ -58,12 +58,6 @@ select wait_for_gpsmon_work(); (1 row) -select count(*) from diskspace_now; - count -------- - 1 -(1 row) - \! netstat -anp | grep udp | grep gpsmon | wc -l (Not all processes could be identified, non-owned process info will not be shown, you would have to be root to see it all.) diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index 4069e342418..0cfcc609540 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -24,22 +24,22 @@ analyze system_history; analyze database_history; analyze diskspace_history; analyze queries_history; -select count(*) from system_now; - count -------- - 1 +select count(*) > 0 from system_now; + ?column? +---------- + t (1 row) -select count(*) from database_now; - count -------- - 1 +select count(*) > 0 from database_now; + ?column? +---------- + t (1 row) -select count(*) from diskspace_now; - count -------- - 1 +select count(*) > 0 from diskspace_now; + ?column? +---------- + t (1 row) select count(*) > 0 from system_history; diff --git a/contrib/perfmon/sql/guc_config.sql b/contrib/perfmon/sql/guc_config.sql index 40d3a5bfd02..177611b45aa 100644 --- a/contrib/perfmon/sql/guc_config.sql +++ b/contrib/perfmon/sql/guc_config.sql @@ -38,6 +38,5 @@ BEGIN END $$ LANGUAGE plpgsql; select wait_for_gpsmon_work(); -select count(*) from diskspace_now; \! netstat -anp | grep udp | grep gpsmon | wc -l \! 
ps -ef | grep gpsmon | grep -v grep | wc -l diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index 718827320d1..77fd142fa76 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -10,9 +10,9 @@ analyze system_history; analyze database_history; analyze diskspace_history; analyze queries_history; -select count(*) from system_now; -select count(*) from database_now; -select count(*) from diskspace_now; +select count(*) > 0 from system_now; +select count(*) > 0 from database_now; +select count(*) > 0 from diskspace_now; select count(*) > 0 from system_history; select count(*) > 0 from database_history; select count(*) > 0 from diskspace_history; From 7a643a21d683232d2790830c31dc7e4f0ed2baef Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Mon, 24 Jun 2024 14:00:34 +0800 Subject: [PATCH 07/40] perfmon: pass hostname to gpsmon Add -h option to gpsmon command, then it uses the passed hostname as the localhostname. This can enable the database to check which machine monitored by gpsmon is master by comparing the hostname in gp_segment_configuration. 
--- contrib/perfmon/src/gpmmon/gpmmon.c | 9 +++++---- contrib/perfmon/src/gpsmon/gpsmon.c | 28 ++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index 19c88379bc3..0af4928a30a 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -630,14 +630,15 @@ static void* conm_main(apr_thread_t* thread_, void* arg_) if (h->smon_bin_location) { //if this if filled, then use it as the directory for smon istead of the default snprintf(line, line_size, "ssh -v -o 'BatchMode yes' -o 'StrictHostKeyChecking no'" - " %s '%s echo -e \"%" APR_INT64_T_FMT "\\n\\n\" | %s -m %" FMT64 " -t %" FMT64 " -l %s%s -v %d %d' 2>&1", - active_hostname, kill_gpsmon, ax.signature, h->smon_bin_location, opt.max_log_size, smon_terminate_timeout, ptr_smon_log_location, ptr_smon_log_location_suffix, opt.v, ax.port); + " %s '%s echo -e \"%" APR_INT64_T_FMT "\\n\\n\" | %s -m %" FMT64 " -t %" FMT64 " -l %s%s -v %d -h %s %d' 2>&1", + active_hostname, kill_gpsmon, ax.signature, h->smon_bin_location, opt.max_log_size, smon_terminate_timeout, ptr_smon_log_location, ptr_smon_log_location_suffix, opt.v, active_hostname, ax.port); } else { snprintf(line, line_size, "ssh -v -o 'BatchMode yes' -o 'StrictHostKeyChecking no'" - " %s '%s echo -e \"%" APR_INT64_T_FMT "\\n\\n\" | %s/bin/gpsmon -m %" FMT64 " -t %" FMT64 " -l %s%s -v %d %d' 2>&1", - active_hostname, kill_gpsmon, ax.signature, ax.gphome, opt.max_log_size, smon_terminate_timeout, ptr_smon_log_location, ptr_smon_log_location_suffix, opt.v, ax.port); + " %s '%s echo -e \"%" APR_INT64_T_FMT "\\n\\n\" | %s/bin/gpsmon -m %" FMT64 " -t %" FMT64 " -l %s%s -v %d -h %s %d' 2>&1", + active_hostname, kill_gpsmon, ax.signature, ax.gphome, opt.max_log_size, smon_terminate_timeout, ptr_smon_log_location, ptr_smon_log_location_suffix, opt.v, active_hostname, ax.port) ; } + TR0(("Command to start gpsmon %s", line)); if 
(h->ever_connected) { diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index 1714cc531a4..a478267e5d0 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -44,6 +44,7 @@ static struct // The timeout in seconds for smon to restart if no requests // come during that period. apr_uint64_t terminate_timeout; + const char* hostname; } opt = { 0 }; int verbose = 0; /* == opt.v */ @@ -1373,19 +1374,25 @@ static void setup_udp() static const char* get_and_allocate_hostname() { - char hname[256] = { 0 }; - - if (gethostname(hname, sizeof(hname) - 1)) + if (opt.hostname) { - gx.hostname = strdup("unknown"); - gpmon_warningx(FLINE, 0, "gethostname failed"); + gx.hostname = strdup(opt.hostname); } else { - hname[sizeof(hname) - 1] = 0; - gx.hostname = strdup(hname); - } + char hname[256] = { 0 }; + if (gethostname(hname, sizeof(hname) - 1)) + { + gx.hostname = strdup("unknown"); + gpmon_warningx(FLINE, 0, "gethostname failed"); + } + else + { + hname[sizeof(hname) - 1] = 0; + gx.hostname = strdup(hname); + } + } return gx.hostname; } @@ -1643,6 +1650,7 @@ static void usage(const char* msg) fprintf(stdout, "\t-t:\tterminate timeout\n"), fprintf(stdout, "\t-a:\titerator aggregate\n"); fprintf(stdout, "\t-i:\tignore qexec packet\n"); + fprintf(stdout, "\t-h:\thostname for this machine\n"); if (msg) fprintf(stdout, "%s\n\n", msg); @@ -1666,6 +1674,7 @@ static void parse_command_line(int argc, const char* const argv[]) { NULL, 't', 1, "terminate timeout" }, { NULL, 'a', 0, "iterator aggregate" }, { NULL, 'i', 0, "ignore qexec packet" }, + { NULL, 'h', 1, "hostname for this machine" }, { NULL, 0, 0, NULL } }; apr_pool_t* pool; @@ -1712,6 +1721,9 @@ static void parse_command_line(int argc, const char* const argv[]) case 't': opt.terminate_timeout = apr_atoi64(arg); break; + case 'h': + opt.hostname = strdup(arg); + break; } } From a766f8146640ddca5e2857c29de37319fdbc3d9b Mon Sep 17 00:00:00 2001 From: 
wangxiaoran Date: Fri, 28 Jun 2024 17:18:48 +0800 Subject: [PATCH 08/40] perfmon extension Enable creating the perfmon extension in the gpperfmon database --- contrib/perfmon/Makefile | 18 +++++++++++------- contrib/perfmon/expected/extension_test.out | 16 ++++++++++++++++ contrib/perfmon/gpperfmon_install | 6 +++--- contrib/perfmon/perfmon.control | 5 +++++ contrib/perfmon/{gpperfmon.sql => perfmon.sql} | 18 +++++++++++++++++- contrib/perfmon/sql/extension_test.sql | 4 ++++ contrib/perfmon/sql/guc_config.sql | 1 + 7 files changed, 57 insertions(+), 11 deletions(-) create mode 100644 contrib/perfmon/expected/extension_test.out create mode 100644 contrib/perfmon/perfmon.control rename contrib/perfmon/{gpperfmon.sql => perfmon.sql} (97%) create mode 100644 contrib/perfmon/sql/extension_test.sql diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index dfb495ec41b..6908da820e3 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,6 +1,7 @@ -NAME = gpperfmon +NAME = perfmon +EXTVERSION = 1.0.0 -REGRESS = pre_run_check guc_config query post_run +REGRESS = pre_run_check guc_config query extension_test post_run ifdef USE_PGXS PG_CONFIG = pg_config @@ -13,6 +14,9 @@ include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif +$(NAME)--$(EXTVERSION).sql: $(NAME).sql + cp $< $@ + .PHONY: clean_perfmon clean_perfmon: $(MAKE) -C src/gpmon clean @@ -23,12 +27,12 @@ clean_perfmon: rm -rf gpperfmon.so clean distclean: clean_perfmon -all: +all: $(NAME)--$(EXTVERSION).sql $(MAKE) -C src/gpmon all $(MAKE) -C src/gpmmon all $(MAKE) -C src/gpsmon all -.PHONY: installdirs +.PHONY: installdirs installdirs: $(MKDIR_P) '$(DESTDIR)$(bindir)/../sbin' install: installdirs @@ -38,6 +42,6 @@ install: installdirs $(INSTALL_SCRIPT) gpperfmon_install '$(bindir)' $(INSTALL_SCRIPT) gpperfmoncat.sh '$(DESTDIR)$(bindir)' $(INSTALL_SCRIPT) gpmon_catqrynow.py '$(DESTDIR)$(bindir)/../sbin/' - mkdir -p
'$(DESTDIR)$(libdir)/$(NAME)' - $(INSTALL_SCRIPT) $(NAME).sql '$(DESTDIR)$(libdir)/$(NAME)' - $(INSTALL_SCRIPT) $(NAME).conf '$(DESTDIR)$(libdir)/$(NAME)' + $(INSTALL_SCRIPT) gpperfmon.conf '$(DESTDIR)$(datadir)' + $(INSTALL_SCRIPT) $(NAME)--$(EXTVERSION).sql '$(DESTDIR)$(datadir)/extension/' + $(INSTALL_SCRIPT) $(NAME).control '$(DESTDIR)$(datadir)/extension/' diff --git a/contrib/perfmon/expected/extension_test.out b/contrib/perfmon/expected/extension_test.out new file mode 100644 index 00000000000..0a237b65443 --- /dev/null +++ b/contrib/perfmon/expected/extension_test.out @@ -0,0 +1,16 @@ +create extension perfmon; +ERROR: "perfmon" extension can only be created in gpperfmon database +CONTEXT: PL/pgSQL function checkdbname() line 7 at RAISE +select count(*) from pg_extension where extname = 'perfmon'; + count +------- + 0 +(1 row) + +\c gpperfmon +select count(*) from pg_extension where extname = 'perfmon'; + count +------- + 1 +(1 row) + diff --git a/contrib/perfmon/gpperfmon_install b/contrib/perfmon/gpperfmon_install index fab55f5f13e..681eb90fe01 100755 --- a/contrib/perfmon/gpperfmon_install +++ b/contrib/perfmon/gpperfmon_install @@ -130,7 +130,7 @@ if __name__ == '__main__': cmd = Command("createdb gpperfmon") commands.append(cmd) - cmd = Command("PGPORT=%d psql -f %s/lib/gpperfmon/gpperfmon.sql gpperfmon" % (options.port, gphome)) + cmd = Command("PGPORT=%d psql -c 'create extension perfmon' gpperfmon" % (options.port)) commands.append(cmd) if options.enable: @@ -140,7 +140,7 @@ if __name__ == '__main__': home_dir = os.getenv('HOME') gpperfmon_conf_dir = "%s/gpperfmon/conf" % coordinatordata_dir gpperfmon_conf_file = "%s/gpperfmon.conf" % gpperfmon_conf_dir - gpperfmon_conf_file_src = "%s/lib/gpperfmon/gpperfmon.conf" % gphome + gpperfmon_conf_file_src = "%s/share/postgresql/gpperfmon.conf" % gphome if not coordinatordata_dir: logger.error("COORDINATOR_DATA_DIRECTORY must be set") @@ -170,7 +170,7 @@ if __name__ == '__main__': # sys.exit(1) if not 
os.path.isfile(gpperfmon_conf_file_src): - logger.error(" gpperfmon.conf doesn't exist in %s/lib/gpperfmon" % gphome) + logger.error(" gpperfmon.conf doesn't exist in %s/share/postgresql" % gphome) sys.exit(1) cmd = Command("""PGPORT=%d psql template1 -c "DROP ROLE IF EXISTS gpmon" """ % options.port) diff --git a/contrib/perfmon/perfmon.control b/contrib/perfmon/perfmon.control new file mode 100644 index 00000000000..6a2e612ce58 --- /dev/null +++ b/contrib/perfmon/perfmon.control @@ -0,0 +1,5 @@ +comment = 'tables and helpers for collecting gpperfmon system and query performance metrics' +default_version = '1.0.0' +module_pathname = '$libdir/perfmon' +relocatable = true +trusted = true diff --git a/contrib/perfmon/gpperfmon.sql b/contrib/perfmon/perfmon.sql similarity index 97% rename from contrib/perfmon/gpperfmon.sql rename to contrib/perfmon/perfmon.sql index b234da121b4..9ae3a530c0c 100644 --- a/contrib/perfmon/gpperfmon.sql +++ b/contrib/perfmon/perfmon.sql @@ -1,3 +1,20 @@ +-- Can only be installed in the gpperfmon database +CREATE OR REPLACE FUNCTION checkdbname() +RETURNS void +AS $$ +DECLARE +dbname varchar; +BEGIN + select current_database() into strict dbname; + IF dbname != 'gpperfmon' THEN + RAISE EXCEPTION '"perfmon" extension can only be created in gpperfmon database'; + END IF; +END; +$$ +LANGUAGE plpgsql; + +select checkdbname(); + -- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -- Gpperfmon Schema
-- system -- -\c gpperfmon; create table public.system_history ( ctime timestamptz(0) not null, -- record creation time diff --git a/contrib/perfmon/sql/extension_test.sql b/contrib/perfmon/sql/extension_test.sql new file mode 100644 index 00000000000..d5d66d2ee51 --- /dev/null +++ b/contrib/perfmon/sql/extension_test.sql @@ -0,0 +1,4 @@ +create extension perfmon; +select count(*) from pg_extension where extname = 'perfmon'; +\c gpperfmon +select count(*) from pg_extension where extname = 'perfmon'; diff --git a/contrib/perfmon/sql/guc_config.sql b/contrib/perfmon/sql/guc_config.sql index 177611b45aa..04bd59903d2 100644 --- a/contrib/perfmon/sql/guc_config.sql +++ b/contrib/perfmon/sql/guc_config.sql @@ -1,6 +1,7 @@ -- Disable perfmon.enable -- start_ignore \! gpconfig -c perfmon.enable -v false +\! pkill gpsmon \! gpstop -ari -- end_ignore \! ps -ef | grep '\[gpmmon\]' | wc -l From 2c7b80d470e1ddc3d2bbb9355ab27ff7886fd83b Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Mon, 1 Jul 2024 17:10:42 +0800 Subject: [PATCH 09/40] perfmon: Add plan column to queries_history Move "query_info_collect_hook" from the beginning of "standard_ExecutorStart" to the end to let the Estate created and filled. 
Add gpperfmon.conf in sql/ folder which is used to config perfmon for regress --- contrib/perfmon/.gitignore | 1 + contrib/perfmon/expected/query.out | 30 +++++--- contrib/perfmon/gpmon_catqrynow.py | 9 +++ contrib/perfmon/gpperfmon_install | 6 +- contrib/perfmon/sql/gpperfmon.conf | 50 ++++++++++++ contrib/perfmon/sql/pre_run_check.sql | 2 + contrib/perfmon/sql/query.sql | 37 ++++++++- contrib/perfmon/src/gpmmon/gpmon_agg.c | 10 ++- contrib/perfmon/src/gpmon/gpmon.c | 101 +++++++++++++++++++------ contrib/perfmon/src/include/gpmon.h | 4 +- src/backend/executor/execMain.c | 3 + 11 files changed, 212 insertions(+), 41 deletions(-) create mode 100644 contrib/perfmon/sql/gpperfmon.conf diff --git a/contrib/perfmon/.gitignore b/contrib/perfmon/.gitignore index f201ec88242..310917de6ed 100644 --- a/contrib/perfmon/.gitignore +++ b/contrib/perfmon/.gitignore @@ -5,3 +5,4 @@ ./src/gpmmon/gpmmon.so ./src/gpmon/gpmon.so ./gpperfmon--?.?.?.sql +./results diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index 0cfcc609540..dc5c09cb861 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -6,13 +6,23 @@ select sess_id from pg_stat_activity where pg_backend_pid()=pid; (1 row) \gset -select pg_sleep(30); - pg_sleep ----------- - +-- end_ignore +CREATE TABLE foo(a int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +CREATE TABLE test(a int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
+INSERT INTO foo SELECT generate_series(0,10); +INSERT INTO test SELECT generate_series(0,10); +select count(*) from foo,test where foo.a=test.a; + count +------- + 11 (1 row) --- end_ignore +DROP TABLE foo; +DROP TABLE test; \c gpperfmon select pg_sleep(100); pg_sleep @@ -60,9 +70,11 @@ select count(*) > 0 from diskspace_history; t (1 row) -select status, query_text from queries_history where ssid = :sess_id; - status | query_text ---------+---------------------- - done | select pg_sleep(30); +select status, query_text, length(query_plan) > 0 from queries_history +where ssid = :sess_id and +query_text = 'select count(*) from foo,test where foo.a=test.a;'; + status | query_text | ?column? +--------+---------------------------------------------------+---------- + done | select count(*) from foo,test where foo.a=test.a; | t (1 row) diff --git a/contrib/perfmon/gpmon_catqrynow.py b/contrib/perfmon/gpmon_catqrynow.py index 0b7932fb0a8..452e80a36c6 100644 --- a/contrib/perfmon/gpmon_catqrynow.py +++ b/contrib/perfmon/gpmon_catqrynow.py @@ -12,6 +12,7 @@ line = line.split('|') (tmid, xid, cid) = line[1:4] qrytxt = '' + plan = '' appname = '' rsqname = '' priority = '' @@ -21,9 +22,14 @@ meta = fp.readline().split(' ') qrytxt = fp.read(int(meta[0])).strip() + newline = fp.readline() + meta = fp.readline().split(' ') + plan = fp.read(int(meta[0])).strip() + newline = fp.readline() meta = fp.readline().split(' ') appname = fp.read(int(meta[0])).strip() + newline = fp.readline() meta = fp.readline().split(' ') @@ -34,6 +40,7 @@ priority = fp.read(int(meta[0])).strip() fp.close() + except: qrytxt = "Query text unavailable" if fp: fp.close() @@ -42,6 +49,8 @@ if qrytxt: qrytxt = '""'.join(qrytxt.split('"')) line[-5] = '"' + qrytxt + '"' + plan = '""'.join(plan.split('"')) + line[-4] = '"' + plan + '"' line[-3] = '"' + appname + '"' line[-2] = '"' + rsqname + '"' line[-1] = '"' + priority + '"' diff --git a/contrib/perfmon/gpperfmon_install b/contrib/perfmon/gpperfmon_install 
index 681eb90fe01..9f43ac92d23 100755 --- a/contrib/perfmon/gpperfmon_install +++ b/contrib/perfmon/gpperfmon_install @@ -165,9 +165,9 @@ if __name__ == '__main__': logger.error("can not find $HOME") sys.exit(1) - #if os.path.isfile(gpperfmon_conf_file): - # logger.error(" gpperfmon.conf already exists %s" % gpperfmon_conf_file) - # sys.exit(1) + if os.path.isfile(gpperfmon_conf_file): + logger.error(" gpperfmon.conf already exists %s" % gpperfmon_conf_file) + sys.exit(1) if not os.path.isfile(gpperfmon_conf_file_src): logger.error(" gpperfmon.conf doesn't exist in %s/share/postgresql" % gphome) diff --git a/contrib/perfmon/sql/gpperfmon.conf b/contrib/perfmon/sql/gpperfmon.conf new file mode 100644 index 00000000000..b16b34090fa --- /dev/null +++ b/contrib/perfmon/sql/gpperfmon.conf @@ -0,0 +1,50 @@ +[GPMMON] +# quantum specifies the time in seconds between updates from +# performance monitor agents on all segments. Valid values +# are 5, 10, 15, 20, 30, or 60 +quantum = 5 + +# min_query_time specifies the minimum query run time +# in seconds for statistics collection. The monitor logs all +# queries that run longer than this value in the queries_history +# table. For queries with shorter run times, no historical +# data is collected. +min_query_time = 0 + +# This should be a percentage between 0 and 100 and should be +# less than the error_disk_space_percentage. If a filesystem's +# disk space used percentage equals or exceeds this value a +# warning will be logged and a warning email/snmp trap may be +# sent. If this configuration is set to 0 or not specified, no +# warnings are sent. +#warning_disk_space_percentage = 80 + +# This should be a percentage between 0 and 100 and should be +# greater than the warning_disk_space_percentage. If a +# filesystem's disk space used percentage equals or exceeds +# this value an error will be logged and a error email/snmp +# trap may be sent. If this configuration is set to 0 or not +# specified, no errors are sent. 
+#error_disk_space_percentage = 90 + +#This is the interval in minutes that limits the number of +#error/warning messages that are sent. The minimum value for +#this configuration is 1. Setting this to 0 or not specifying +#this configuration results in it getting set to the minimum. +disk_space_interval = 60 + +#This is the maximum number of error/warning messages that +#will be sent in the disk_space_interval. The maximum value +#for this configuration is 50. The minimum value for this +#configuration is 1. Setting this configuration to greater +#than 50 or not specifying this configuration results in it +#getting set to the maximum. +max_disk_space_messages_per_interval = 10 + +# The number of partitions for statistics data in month +# will be retained. Older partitions will be dropped. +#partition_age = 6 + +log_location = gpperfmon/logs + +harvest_interval = 30 diff --git a/contrib/perfmon/sql/pre_run_check.sql b/contrib/perfmon/sql/pre_run_check.sql index 81465ab6f1f..30abca1e535 100644 --- a/contrib/perfmon/sql/pre_run_check.sql +++ b/contrib/perfmon/sql/pre_run_check.sql @@ -1,6 +1,8 @@ -- start_ignore drop database if exists gpperfmon; +\! rm -rf $COORDINATOR_DATA_DIRECTORY/gpperfmon/conf/gpperfmon.conf \! gpperfmon_install --enable --port $PGPORT --password 123 +\! cp sql/gpperfmon.conf $COORDINATOR_DATA_DIRECTORY/gpperfmon/conf/ \! gpstop -ari -- end_ignore \c contrib_regression diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index 77fd142fa76..412ac44ea01 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -1,9 +1,39 @@ -- start_ignore +-- wait a while as sometimes the gpmmon is not ready +\c gpperfmon +CREATE OR REPLACE FUNCTION wait_for_gpmmon_work() RETURNS void AS $$ +DECLARE +DECLARE +start_time timestamptz := clock_timestamp(); +updated bool; +BEGIN + -- we don't want to wait forever; loop will exit after 60 seconds + FOR i IN 1 .. 
1000 LOOP + SELECT(SELECT count(*) > 0 from queries_history ) INTO updated; + EXIT WHEN updated; + + -- wait a little + PERFORM pg_sleep_for('100 milliseconds'); + END LOOP; + -- report time waited in postmaster log (where it won't change test output) + RAISE log 'wait_for_gpmmon_work delayed % seconds', + EXTRACT(epoch FROM clock_timestamp() - start_time); +END +$$ LANGUAGE plpgsql; +select wait_for_gpmmon_work(); +\c contrib_regression select sess_id from pg_stat_activity where pg_backend_pid()=pid; \gset -select pg_sleep(30); -- end_ignore +CREATE TABLE foo(a int); +CREATE TABLE test(a int); +INSERT INTO foo SELECT generate_series(0,10); +INSERT INTO test SELECT generate_series(0,10); +select count(*) from foo,test where foo.a=test.a; +DROP TABLE foo; +DROP TABLE test; + \c gpperfmon select pg_sleep(100); analyze system_history; @@ -16,4 +46,7 @@ select count(*) > 0 from diskspace_now; select count(*) > 0 from system_history; select count(*) > 0 from database_history; select count(*) > 0 from diskspace_history; -select status, query_text from queries_history where ssid = :sess_id; + +select status, query_text, length(query_plan) > 0 from queries_history +where ssid = :sess_id and +query_text = 'select count(*) from foo,test where foo.a=test.a;'; diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 51ccc087750..2a85c1ca15d 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -509,6 +509,14 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr apr_int32_t age = newagg->generation - dp->last_updated_generation - 1; if (age > 0) { + if (status == GPMON_QLOG_STATUS_DONE && + ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time )) + { + TR2(("agg_dup: skip short query %d.%d.%d generation %d, current generation %d, recorded %d\n", + dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, + (int) dp->last_updated_generation, (int) 
newagg->generation, dp->recorded)); + continue; + } if ( (status != GPMON_QLOG_STATUS_SUBMIT && status != GPMON_QLOG_STATUS_CANCELING && status != GPMON_QLOG_STATUS_START) @@ -1509,7 +1517,7 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows fprintf(fp, "|"); bytes_written++; - if (!all_good || iter == 1){ + if (!all_good){ // we have no data for query plan // if we failed once already don't bother trying to parse query file continue; diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 9f0ffcc8388..840b864588c 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -25,6 +25,7 @@ #include "utils/metrics_utils.h" #include "utils/metrics_utils.h" #include "utils/snapmgr.h" +#include "commands/explain.h" PG_MODULE_MAGIC; static int32 init_tmid = -1;; @@ -50,6 +51,8 @@ static void gpmon_query_info_collect_hook(QueryMetricsStatus status, void *query static gpmon_packet_t* gpmon_qlog_packet_init(); static void init_gpmon_hooks(void); +static char* get_plan(QueryDesc *queryDesc); +static char* get_query_text(QueryDesc *queryDesc); struct { int gxsock; @@ -255,11 +258,6 @@ void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket) gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_SUBMIT; gpmonPacket->u.qlog.tsubmit = tv.tv_sec; - //gpmon_record_update(gpmonPacket->u.qlog.key.tmid, - // gpmonPacket->u.qlog.key.ssid, - // gpmonPacket->u.qlog.key.ccnt, - // gpmonPacket->u.qlog.status); - // gpmon_send(gpmonPacket); } @@ -285,15 +283,18 @@ static const char* gpmon_null_subst(const char* input) void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, const char *queryText, + const char *plan, const char *appName, const char *resqName, - const char *resqPriority) + const char *resqPriority, + int status) { GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); char fname[GPMON_DIR_MAX_PATH]; FILE* fp; queryText = gpmon_null_subst(queryText); + plan = gpmon_null_subst(plan); appName = 
gpmon_null_subst(appName); resqName = gpmon_null_subst(resqName); resqPriority = gpmon_null_subst(resqPriority); @@ -304,23 +305,22 @@ void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, Assert(resqPriority); - snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, + snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, gpmonPacket->u.qlog.key.tmid, gpmonPacket->u.qlog.key.ssid, gpmonPacket->u.qlog.key.ccnt); + fp = fopen(fname, "w+"); - fp = fopen(fname, "a"); if (!fp) return; - gpmon_record_kv_with_file("qtext", queryText, false, fp); + gpmon_record_kv_with_file("qtext", queryText, false, fp); + gpmon_record_kv_with_file("plan", plan, false, fp); gpmon_record_kv_with_file("appname", appName, false, fp); - gpmon_record_kv_with_file("resqname", resqName, false, fp); - gpmon_record_kv_with_file("priority", resqPriority, true, fp); + fprintf(fp, "%d", status); - fprintf(fp, "%d", GPMON_QLOG_STATUS_SUBMIT); fclose(fp); } @@ -338,12 +338,10 @@ void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket) gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_START; gpmonPacket->u.qlog.tstart = tv.tv_sec; - gpmon_record_update(gpmonPacket->u.qlog.key.tmid, gpmonPacket->u.qlog.key.ssid, gpmonPacket->u.qlog.key.ccnt, gpmonPacket->u.qlog.status); - gpmon_send(gpmonPacket); } @@ -412,6 +410,7 @@ static void gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) { char *query_text; + char *plan; QueryDesc *qd = (QueryDesc *)queryDesc; if (perfmon_enabled && Gp_role == GP_ROLE_DISPATCH && qd != NULL) @@ -426,21 +425,14 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) gpmon_qlog_query_start(gpmonPacket); break; case METRICS_QUERY_SUBMIT: - /* convert to UTF8 which is encoding for gpperfmon database */ - query_text = (char *)qd->sourceText; - /** - * When client encoding and server encoding are different, do apply the conversion. 
- */ - if (GetDatabaseEncoding() != pg_get_client_encoding()) - { - query_text = (char *)pg_do_encoding_conversion((unsigned char*)qd->sourceText, - strlen(qd->sourceText), GetDatabaseEncoding(), PG_UTF8); - } + query_text = get_query_text(qd); gpmon_qlog_query_text(gpmonPacket, query_text, + NULL, application_name, NULL, - NULL); + NULL, + GPMON_QLOG_STATUS_SUBMIT); gpmon_qlog_query_submit(gpmonPacket); break; case METRICS_QUERY_DONE: @@ -454,6 +446,18 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) case METRICS_QUERY_CANCELED: gpmon_qlog_query_error(gpmonPacket); break; + case METRICS_PLAN_NODE_INITIALIZE: + query_text = get_query_text(qd); + plan = get_plan(qd); + gpmon_qlog_query_text(gpmonPacket, + query_text, + plan, + application_name, + NULL, + NULL, + GPMON_QLOG_STATUS_START); + pfree(plan); + break; default: break; } @@ -510,3 +514,50 @@ _PG_init(void) void _PG_fini(void) {} + +static +char* get_plan(QueryDesc *queryDesc) +{ + char *plan; + ExplainState *es = NewExplainState(); + + es->analyze = false; + es->verbose = true; + es->buffers = true; + es->wal = true; + es->timing = true; + es->summary = es->analyze; + es->format = EXPLAIN_FORMAT_JSON; + es->settings = true; + + ExplainBeginOutput(es); + ExplainPrintPlan(es, queryDesc); + ExplainEndOutput(es); + + /* Remove last line break */ + if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n') + es->str->data[--es->str->len] = '\0'; + + /* Fix JSON to output an object */ + es->str->data[0] = '{'; + es->str->data[es->str->len - 1] = '}'; + plan = es->str->data; + pfree(es); + return plan; +} + +static +char* get_query_text(QueryDesc *qd) +{ + /* convert to UTF8 which is encoding for gpperfmon database */ + char *query_text = (char *)qd->sourceText; + /** + * When client encoding and server encoding are different, do apply the conversion. 
+ */ + if (GetDatabaseEncoding() != pg_get_client_encoding()) + { + query_text = (char *)pg_do_encoding_conversion((unsigned char*)qd->sourceText, + strlen(qd->sourceText), GetDatabaseEncoding(), PG_UTF8); + } + return query_text; +} diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index 7f3d49e539e..a9f779abc45 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -48,9 +48,11 @@ for example SCHEMA.RELATION\0 extern void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket); extern void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, const char *queryText, + const char *plan, const char *appName, const char *resqName, - const char *resqPriority); + const char *resqPriority, + int status); extern void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket); extern void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket); extern void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 5bb1463e422..33100f151af 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -266,6 +266,9 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) if (query_info_collect_hook) (*query_info_collect_hook)(METRICS_QUERY_START, queryDesc); + /** + * Distribute memory to operators. 
+ */ if (Gp_role == GP_ROLE_DISPATCH) { if (!IsResManagerMemoryPolicyNone() && From c0e3c79daf548efd318ce987775d9c20c7415f9b Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Wed, 10 Jul 2024 16:13:30 +0800 Subject: [PATCH 10/40] Creating cloudberryui related tables in gpperfmon db Creating cloudberryui related table in cbui schema under gpperfmon database when creating extenion perfmon --- contrib/perfmon/expected/extension_test.out | 16 +++++++ contrib/perfmon/perfmon.sql | 50 +++++++++++++++++++++ contrib/perfmon/sql/extension_test.sql | 3 ++ 3 files changed, 69 insertions(+) diff --git a/contrib/perfmon/expected/extension_test.out b/contrib/perfmon/expected/extension_test.out index 0a237b65443..80ce9509534 100644 --- a/contrib/perfmon/expected/extension_test.out +++ b/contrib/perfmon/expected/extension_test.out @@ -14,3 +14,19 @@ select count(*) from pg_extension where extname = 'perfmon'; 1 (1 row) +set search_path to cbui; +\d + List of relations + Schema | Name | Type | Owner | Storage +--------+-----------------------------------+----------+---------+--------- + cbui | users | table | gpadmin | heap + cbui | users_user_id_seq | sequence | gpadmin | + cbui | worksheet_item_types | table | gpadmin | heap + cbui | worksheet_item_types_type_id_seq | sequence | gpadmin | + cbui | worksheet_items | table | gpadmin | heap + cbui | worksheet_items_item_id_seq | sequence | gpadmin | + cbui | worksheet_versions | table | gpadmin | heap + cbui | worksheet_versions_version_id_seq | sequence | gpadmin | +(8 rows) + +reset search_path; diff --git a/contrib/perfmon/perfmon.sql b/contrib/perfmon/perfmon.sql index 9ae3a530c0c..2885dcdfb30 100644 --- a/contrib/perfmon/perfmon.sql +++ b/contrib/perfmon/perfmon.sql @@ -313,3 +313,53 @@ revoke all on database gpperfmon from public; -- for web ui auth everyone needs connect permissions grant connect on database gpperfmon to public; -- END +-- for web ui +create schema cbui; +set search_path to cbui; +CREATE TABLE users ( + 
user_id SERIAL PRIMARY KEY, + username VARCHAR(255) NOT NULL, + registration_date TIMESTAMP WITHOUT TIME ZONE NOT NULL, + last_login_date TIMESTAMP WITHOUT TIME ZONE +); +INSERT INTO users (username, registration_date, last_login_date) +SELECT usename, CURRENT_TIMESTAMP, NULL +FROM pg_user; + +CREATE TABLE worksheet_item_types ( + type_id SERIAL PRIMARY KEY, + type_name VARCHAR(255) NOT NULL +); + +INSERT INTO worksheet_item_types (type_id, type_name) +VALUES +(1, 'SQL'), +(2, 'PYTHON'); + +CREATE TABLE worksheet_items ( + item_id SERIAL PRIMARY KEY, + parent_id INTEGER REFERENCES worksheet_items, + user_id INTEGER REFERENCES users, + type_id INTEGER REFERENCES worksheet_item_types, + name VARCHAR(255) NOT NULL, + description TEXT, + creation_date TIMESTAMP WITHOUT TIME ZONE NOT NULL, + last_modified_date TIMESTAMP WITHOUT TIME ZONE NOT NULL, + is_shared BOOLEAN NOT NULL, + shared_link VARCHAR(255) +); + +CREATE TABLE worksheet_versions ( + version_id SERIAL PRIMARY KEY, + version_name VARCHAR(255) NOT NULL, + item_id INTEGER REFERENCES worksheet_items, + content TEXT NOT NULL, + version_number INTEGER NOT NULL, + creation_date TIMESTAMP WITHOUT TIME ZONE NOT NULL, + db_name VARCHAR(255) DEFAULT 'gpadmin', + database_id INTEGER DEFAULT -1, + schema_name VARCHAR(255) DEFAULT 'public', + schema_id INTEGER DEFAULT -1 , + uuid VARCHAR(1000) DEFAULT '-1' +); +RESET search_path; diff --git a/contrib/perfmon/sql/extension_test.sql b/contrib/perfmon/sql/extension_test.sql index d5d66d2ee51..433972c090f 100644 --- a/contrib/perfmon/sql/extension_test.sql +++ b/contrib/perfmon/sql/extension_test.sql @@ -2,3 +2,6 @@ create extension perfmon; select count(*) from pg_extension where extname = 'perfmon'; \c gpperfmon select count(*) from pg_extension where extname = 'perfmon'; +set search_path to cbui; +\d +reset search_path; From 007eb859325092d835e426c038aafeb2510ca415 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Sat, 3 Aug 2024 01:53:27 +0800 Subject: [PATCH 11/40] Fix 
compile issues of perfmon https://code.hashdata.xyz/cloudberry/database/hashdata-lightning/-/issues/105 As some functions didn't check the result code, it will report error on ubuntu (don't konw why not report such warnning on other platforms) By the way ,remove the specific apr interface director in the makefile, it shoule be set when "./configure" --- contrib/perfmon/src/common/gpmonlib.c | 7 ++++++- contrib/perfmon/src/gpmmon/Makefile | 2 +- contrib/perfmon/src/gpmmon/gpmmon.c | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/contrib/perfmon/src/common/gpmonlib.c b/contrib/perfmon/src/common/gpmonlib.c index 7b23798e7af..b56ab60de9e 100644 --- a/contrib/perfmon/src/common/gpmonlib.c +++ b/contrib/perfmon/src/common/gpmonlib.c @@ -348,7 +348,12 @@ apr_int32_t get_query_status(apr_int32_t tmid, apr_int32_t ssid, fclose(fp); return GPMON_QLOG_STATUS_INVALID; } - fscanf(fp, "%d", &status); + if (fscanf(fp, "%d", &status) != 1) + { + fclose(fp); + return GPMON_QLOG_STATUS_INVALID; + } + fclose(fp); return status; } diff --git a/contrib/perfmon/src/gpmmon/Makefile b/contrib/perfmon/src/gpmmon/Makefile index 3501f77a391..30e072435b0 100644 --- a/contrib/perfmon/src/gpmmon/Makefile +++ b/contrib/perfmon/src/gpmmon/Makefile @@ -4,7 +4,7 @@ MODULE_big = gpmmon OBJS = gpmmon.o gpmondb.o gpmon_agg.o ../common/gpmonlib.o SHLIB_LINK += -levent -lapr-1 -laprutil-1 -lm PG_CFLAGS += -Wno-error=vla -Wno-vla -PG_CPPFLAGS = -I$(libpq_srcdir) -I../include -I/usr/include/apr-1 +PG_CPPFLAGS = -I$(libpq_srcdir) -I../include SHLIB_LINK_INTERNAL = -Wl,-Bsymbolic -Wl,-Bstatic -Wl,-Bstatic $(libpq) -lpgcommon_shlib -Wl,-Bdynamic ifdef USE_PGXS diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index 0af4928a30a..790f42539db 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -1129,7 +1129,8 @@ static void gpmmon_main(void) { update_mmonlog_filename(); apr_thread_mutex_lock(logfile_mutex); - 
freopen(mmon_log_filename, "w", stdout); + if (!freopen(mmon_log_filename, "w", stdout)) + gpmon_fatal(FLINE, "failed to open gpmmon log file : \"%s\"", mmon_log_filename); apr_thread_mutex_unlock(logfile_mutex); } } From f46955077c2594336d7767e8a2ce9f98cdc0d000 Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Mon, 19 Aug 2024 14:17:41 +0800 Subject: [PATCH 12/40] Fix:perfmon record wrong query history timestamp When the qd node triggers the hook to send qlog messages to gpsmon, the current status of the query is recorded in a local file. Subsequently, gpsmon forwards the qlog to gpmmon, and gpmmon determines whether this query has been completed based on the status of the qlog to decide whether to clear the query-related information in the memory. The reason for this bug is that gpmmon does not judge based on the status of the qlog but directly judges through the file that records the query status. When gpsmon has not yet forwarded the qlog information to gpmmon, and gpmmon directly checks the status of the query in the file, it will accidentally clear the query information in the memory. 
--- contrib/perfmon/src/gpmmon/gpmon_agg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 2a85c1ca15d..f04fa768583 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -504,12 +504,13 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr /* skip all entries that weren't updated recently and aren't waiting in a queue */ /* Read status from query text as this is reliable */ + /* Todo Why read status from query text instead of dp?*/ status = get_query_status(dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt); apr_int32_t age = newagg->generation - dp->last_updated_generation - 1; if (age > 0) { - if (status == GPMON_QLOG_STATUS_DONE && + if (status == GPMON_QLOG_STATUS_DONE && dp->qlog.tfin >= dp->qlog.tstart && ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time )) { TR2(("agg_dup: skip short query %d.%d.%d generation %d, current generation %d, recorded %d\n", From 6c8344ef8df4f239a056ce3a1b0cdd706dc0351e Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Mon, 9 Sep 2024 10:34:44 +0800 Subject: [PATCH 13/40] =?UTF-8?q?Perfmon:=20add=20cpu=E3=80=81memory?= =?UTF-8?q?=E3=80=81spill=20files=20metrics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When The QE executes SQL, it will trigger hook function to send the query ID and process PID to gpsmon. gpsmon saves the pid and queryID in memory and records the memory and CPU metrics of this pid according to the refresh frequency. The QD node pulls data from gpsmon and aggregates it on the QD node. Finally, metrics will record in the query_history table. 
--- contrib/perfmon/Makefile | 4 +- contrib/perfmon/gpmon_catqrynow.py | 10 +- contrib/perfmon/perfmon--1.0.0--1.1.0.sql | 23 + contrib/perfmon/perfmon.control | 2 +- contrib/perfmon/perfmon.sql | 6 +- contrib/perfmon/src/gpmmon/gpmon_agg.c | 744 +++++++++++++--------- contrib/perfmon/src/gpmmon/gpmon_agg.h | 9 +- contrib/perfmon/src/gpmmon/gpmondb.c | 45 +- contrib/perfmon/src/gpmmon/gpmondb.h | 2 + contrib/perfmon/src/gpmon/gpmon.c | 64 +- contrib/perfmon/src/gpsmon/gpsmon.c | 144 +++-- contrib/perfmon/src/include/gpmon.h | 2 + 12 files changed, 674 insertions(+), 381 deletions(-) create mode 100644 contrib/perfmon/perfmon--1.0.0--1.1.0.sql diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index 6908da820e3..1226038a676 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,5 +1,5 @@ NAME = perfmon -EXTVERSION = 1.0.0 +EXTVERSION = 1.1.0 REGRESS = pre_run_check guc_config query extension_test post_run @@ -43,5 +43,5 @@ install: installdirs $(INSTALL_SCRIPT) gpperfmoncat.sh '$(DESTDIR)$(bindir)' $(INSTALL_SCRIPT) gpmon_catqrynow.py '$(DESTDIR)$(bindir)/../sbin/' $(INSTALL_SCRIPT) gpperfmon.conf '$(DESTDIR)$(datadir)' - $(INSTALL_SCRIPT) $(NAME)--$(EXTVERSION).sql '$(DESTDIR)$(datadir)/extension/' + $(INSTALL_SCRIPT) $(wildcard $(NAME)*--*.sql) '$(DESTDIR)$(datadir)/extension/' $(INSTALL_SCRIPT) $(NAME).control '$(DESTDIR)$(datadir)/extension/' diff --git a/contrib/perfmon/gpmon_catqrynow.py b/contrib/perfmon/gpmon_catqrynow.py index 452e80a36c6..e0cfdc7a9ca 100644 --- a/contrib/perfmon/gpmon_catqrynow.py +++ b/contrib/perfmon/gpmon_catqrynow.py @@ -48,10 +48,10 @@ # escape all " with "" if qrytxt: qrytxt = '""'.join(qrytxt.split('"')) - line[-5] = '"' + qrytxt + '"' + line[-9] = '"' + qrytxt + '"' plan = '""'.join(plan.split('"')) - line[-4] = '"' + plan + '"' - line[-3] = '"' + appname + '"' - line[-2] = '"' + rsqname + '"' - line[-1] = '"' + priority + '"' + line[-8] = '"' + plan + '"' + line[-7] = '"' + appname + '"' + 
line[-6] = '"' + rsqname + '"' + line[-5] = '"' + priority + '"' print('|'.join(line).strip()) diff --git a/contrib/perfmon/perfmon--1.0.0--1.1.0.sql b/contrib/perfmon/perfmon--1.0.0--1.1.0.sql new file mode 100644 index 00000000000..c9527a65f66 --- /dev/null +++ b/contrib/perfmon/perfmon--1.0.0--1.1.0.sql @@ -0,0 +1,23 @@ +ALTER TABLE queries_history +ADD COLUMN mem_peak BIGINT NOT NULL, +ADD COLUMN spill_file_size BIGINT NOT NULL, +ADD COLUMN disk_read BIGINT NOT NULL, +ADD COLUMN disk_write BIGINT NOT NULL; + +ALTER FOREIGN TABLE queries_now +ADD COLUMN mem_peak BIGINT NOT NULL, +ADD COLUMN spill_file_size BIGINT NOT NULL, +ADD COLUMN disk_read BIGINT NOT NULL, +ADD COLUMN disk_write BIGINT NOT NULL; + +ALTER FOREIGN TABLE queries_tail +ADD COLUMN mem_peak BIGINT NOT NULL, +ADD COLUMN spill_file_size BIGINT NOT NULL, +ADD COLUMN disk_read BIGINT NOT NULL, +ADD COLUMN disk_write BIGINT NOT NULL; + +ALTER FOREIGN TABLE _queries_tail +ADD COLUMN mem_peak BIGINT NOT NULL, +ADD COLUMN spill_file_size BIGINT NOT NULL, +ADD COLUMN disk_read BIGINT NOT NULL, +ADD COLUMN disk_write BIGINT NOT NULL; \ No newline at end of file diff --git a/contrib/perfmon/perfmon.control b/contrib/perfmon/perfmon.control index 6a2e612ce58..0bd3cd84ee2 100644 --- a/contrib/perfmon/perfmon.control +++ b/contrib/perfmon/perfmon.control @@ -1,5 +1,5 @@ comment = 'data type for storing sets of (key, value) pairs' -default_version = '1.0.0' +default_version = '1.1.0' module_pathname = '$libdir/perfmon' relocatable = true trusted = true diff --git a/contrib/perfmon/perfmon.sql b/contrib/perfmon/perfmon.sql index 2885dcdfb30..ccfd80c21a5 100644 --- a/contrib/perfmon/perfmon.sql +++ b/contrib/perfmon/perfmon.sql @@ -94,7 +94,11 @@ create table public.queries_history ( query_plan text not null default '', -- query plan (not implemented) application_name varchar(64), -- from 4.2 onwards rsqname varchar(64), -- from 4.2 onwards - rqppriority varchar(16) -- from 4.2 onwards + rqppriority varchar(16), 
-- from 4.2 onwards + mem_peak bigint not null, -- memory peak for all processes executing query + spill_file_size bigint not null, -- query spill files size + disk_read bigint not null, -- disk read for all processes executing query + disk_write bigint not null -- disk write for all processes executing query ) with (fillfactor=100) distributed by (ctime) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index f04fa768583..82ab5234e50 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -50,15 +50,6 @@ typedef struct mmon_query_seginfo_t apr_uint64_t sum_measures_rows_out; } mmon_query_seginfo_t; //The agg value at segment level for query -typedef struct qdnode_t { - apr_int64_t last_updated_generation; - int recorded; - int num_metrics_packets; - gpmon_qlog_t qlog; - apr_hash_t* qexec_hash; - apr_hash_t* query_seginfo_hash; -} qdnode_t; - struct agg_t { apr_int64_t generation; @@ -83,6 +74,7 @@ extern apr_queue_t* message_queue; extern void incremement_tail_bytes(apr_uint64_t bytes); static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool); +void gpdb_get_spill_file_size_from_query(qdnode_t *qdnode); /** * Disk space check helper function @@ -294,43 +286,46 @@ static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) return 0; } -static apr_status_t agg_put_segment(agg_t* agg, const gpmon_seginfo_t* seg) -{ - gpmon_seginfo_t* rec; - - rec = apr_hash_get(agg->stab, &seg->dbid, sizeof(seg->dbid)); - if (rec) - { - *rec = *seg; - } - else - { - rec = apr_palloc(agg->pool, sizeof(*rec)); - if (!rec) - { - return APR_ENOMEM; - } - *rec = *seg; - apr_hash_set(agg->stab, &rec->dbid, sizeof(rec->dbid), rec); - } - return 0; -} +// static apr_status_t agg_put_segment(agg_t* agg, const gpmon_seginfo_t* seg) +// { +// gpmon_seginfo_t* rec; + +// rec = apr_hash_get(agg->stab, &seg->dbid, 
sizeof(seg->dbid)); +// if (rec) +// { +// *rec = *seg; +// } +// else +// { +// rec = apr_palloc(agg->pool, sizeof(*rec)); +// if (!rec) +// { +// return APR_ENOMEM; +// } +// *rec = *seg; +// apr_hash_set(agg->stab, &rec->dbid, sizeof(rec->dbid), rec); +// } +// return 0; +// } static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { qdnode_t* node; node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); - if (!node) { - gpmon_qlogkey_t new_key = qlog->key; - new_key.ccnt = 0; - node = apr_hash_get(agg->qtab, &new_key, sizeof(new_key)); - } + if (!node) { + gpmon_warning(FLINE, "put query metrics can not find qdnode from qtab, queryID :%d-%d-%d", qlog->key.tmid,qlog->key.ssid,qlog->key.ccnt); + } if (node) { // here update the stats for the query node->qlog.cpu_elapsed += qlog->cpu_elapsed; node->qlog.p_metrics.cpu_pct += qlog->p_metrics.cpu_pct; + node->qlog.p_metrics.fd_cnt += qlog->p_metrics.fd_cnt; + if (qlog->p_metrics.mem.size > node->qlog.p_metrics.mem.size) + { + node->qlog.p_metrics.mem.size = qlog->p_metrics.mem.size; + }; node->last_updated_generation = generation; node->num_metrics_packets++; TR2(("Query Metrics: (host %s ssid %d ccnt %d) (cpuelapsed %d cpupct %f) / %d\n", @@ -343,12 +338,18 @@ static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { - qdnode_t* node; + if (qlog->dbid == gpperfmon_dbid) { + TR2(("agg_put_qlog:(%d.%d.%d) ignore gpperfmon sql\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt)); + return 0; + } + qdnode_t* node; node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); if (node) { - //node->qlog = *qlog; - merge_qlog(&node->qlog, qlog); + node->qlog.status = qlog->status; + node->qlog.tstart = qlog->tstart; + node->qlog.tsubmit = qlog->tsubmit; + node->qlog.tfin = qlog->tfin; if (qlog->dbid != gpperfmon_dbid) { TR2(("agg_put_qlog: found 
%d.%d.%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); } @@ -360,7 +361,10 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, node->qlog = *qlog; node->recorded = 0; node->qlog.cpu_elapsed = 0; - node->qlog.p_metrics.cpu_pct = 0.0; + node->qlog.p_metrics.cpu_pct = 0.0f; + node->qlog.p_metrics.fd_cnt = 0; + node->qlog.p_metrics.cpu_skew = 0.0f; + node->qlog.p_metrics.mem.size = 0; node->num_metrics_packets = 0; node->qexec_hash = apr_hash_make(agg->pool); @@ -386,48 +390,48 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, } -static apr_status_t agg_put_qexec(agg_t* agg, const qexec_packet_t* qexec_packet, apr_int64_t generation) -{ - qdnode_t* dp; - gpmon_qlogkey_t key; - mmon_qexec_t* mmon_qexec_existing = 0; - - /* find qdnode of this qexec */ - key.tmid = qexec_packet->data.key.tmid; - key.ssid = qexec_packet->data.key.ssid; - key.ccnt = qexec_packet->data.key.ccnt; - dp = apr_hash_get(agg->qtab, &key, sizeof(key)); - - if (!dp) { /* not found, internal SPI query. Ignore. */ - return 0; - } - - mmon_qexec_existing = apr_hash_get(dp->qexec_hash, &qexec_packet->data.key.hash_key, sizeof(qexec_packet->data.key.hash_key)); - - /* if found, replace it */ - if (mmon_qexec_existing) { - mmon_qexec_existing->key.ccnt = qexec_packet->data.key.ccnt; - mmon_qexec_existing->key.ssid = qexec_packet->data.key.ssid; - mmon_qexec_existing->key.tmid = qexec_packet->data.key.tmid; - mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; - mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; - mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; - } - else { - /* not found, make new hash entry */ - if (! 
(mmon_qexec_existing = apr_palloc(agg->pool, sizeof(mmon_qexec_t)))) - return APR_ENOMEM; - - memcpy(&mmon_qexec_existing->key, &qexec_packet->data.key, sizeof(gpmon_qexeckey_t)); - mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; - mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; - mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; - apr_hash_set(dp->qexec_hash, &mmon_qexec_existing->key.hash_key, sizeof(mmon_qexec_existing->key.hash_key), mmon_qexec_existing); - } - - dp->last_updated_generation = generation; - return 0; -} +// static apr_status_t agg_put_qexec(agg_t* agg, const qexec_packet_t* qexec_packet, apr_int64_t generation) +// { +// qdnode_t* dp; +// gpmon_qlogkey_t key; +// mmon_qexec_t* mmon_qexec_existing = 0; + +// /* find qdnode of this qexec */ +// key.tmid = qexec_packet->data.key.tmid; +// key.ssid = qexec_packet->data.key.ssid; +// key.ccnt = qexec_packet->data.key.ccnt; +// dp = apr_hash_get(agg->qtab, &key, sizeof(key)); + +// if (!dp) { /* not found, internal SPI query. Ignore. */ +// return 0; +// } + +// mmon_qexec_existing = apr_hash_get(dp->qexec_hash, &qexec_packet->data.key.hash_key, sizeof(qexec_packet->data.key.hash_key)); + +// /* if found, replace it */ +// if (mmon_qexec_existing) { +// mmon_qexec_existing->key.ccnt = qexec_packet->data.key.ccnt; +// mmon_qexec_existing->key.ssid = qexec_packet->data.key.ssid; +// mmon_qexec_existing->key.tmid = qexec_packet->data.key.tmid; +// mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; +// mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; +// mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; +// } +// else { +// /* not found, make new hash entry */ +// if (! 
(mmon_qexec_existing = apr_palloc(agg->pool, sizeof(mmon_qexec_t)))) +// return APR_ENOMEM; + +// memcpy(&mmon_qexec_existing->key, &qexec_packet->data.key, sizeof(gpmon_qexeckey_t)); +// mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; +// mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; +// mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; +// apr_hash_set(dp->qexec_hash, &mmon_qexec_existing->key.hash_key, sizeof(mmon_qexec_existing->key.hash_key), mmon_qexec_existing); +// } + +// dp->last_updated_generation = generation; +// return 0; +// } apr_status_t agg_create(agg_t** retagg, apr_int64_t generation, apr_pool_t* parent_pool, apr_hash_t* fsinfotab) @@ -510,8 +514,7 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr apr_int32_t age = newagg->generation - dp->last_updated_generation - 1; if (age > 0) { - if (status == GPMON_QLOG_STATUS_DONE && dp->qlog.tfin >= dp->qlog.tstart && - ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time )) + if (status == GPMON_QLOG_STATUS_DONE && dp->qlog.tfin > 0 && ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time )) { TR2(("agg_dup: skip short query %d.%d.%d generation %d, current generation %d, recorded %d\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, @@ -533,6 +536,10 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr continue; } } + else if (dp->qlog.status == GPMON_QLOG_STATUS_DONE && status == GPMON_QLOG_STATUS_INVALID) + { + continue; + } /* check if we missed a status change */ if (dp->qlog.status != status) @@ -550,28 +557,28 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr *newdp = *dp; - newdp->qexec_hash = apr_hash_make(newagg->pool); - if (!newdp->qexec_hash) { - agg_destroy(newagg); - return APR_ENOMEM; - } - - cnt = 0; - // Copy the qexec hash table - for (hj = apr_hash_first(newagg->pool, dp->qexec_hash); hj; hj = apr_hash_next(hj)) { - 
mmon_qexec_t* new_qexec; - apr_hash_this(hj, 0, 0, &vptr); - - //allocate the packet - if (!(new_qexec = apr_pcalloc(newagg->pool, sizeof(mmon_qexec_t)))) { - agg_destroy(newagg); - return APR_ENOMEM; - } - *new_qexec = *((mmon_qexec_t*)vptr); - - apr_hash_set(newdp->qexec_hash, &(new_qexec->key.hash_key), sizeof(new_qexec->key.hash_key), new_qexec); - TR2( ("\t %d: (%d, %d)\n", ++cnt, new_qexec->key.hash_key.segid, new_qexec->key.hash_key.nid)); - } + // newdp->qexec_hash = apr_hash_make(newagg->pool); + // if (!newdp->qexec_hash) { + // agg_destroy(newagg); + // return APR_ENOMEM; + // } + + // cnt = 0; + // // Copy the qexec hash table + // for (hj = apr_hash_first(newagg->pool, dp->qexec_hash); hj; hj = apr_hash_next(hj)) { + // mmon_qexec_t* new_qexec; + // apr_hash_this(hj, 0, 0, &vptr); + + // //allocate the packet + // if (!(new_qexec = apr_pcalloc(newagg->pool, sizeof(mmon_qexec_t)))) { + // agg_destroy(newagg); + // return APR_ENOMEM; + // } + // *new_qexec = *((mmon_qexec_t*)vptr); + + // apr_hash_set(newdp->qexec_hash, &(new_qexec->key.hash_key), sizeof(new_qexec->key.hash_key), new_qexec); + // TR2( ("\t %d: (%d, %d)\n", ++cnt, new_qexec->key.hash_key.segid, new_qexec->key.hash_key.nid)); + // } newdp->query_seginfo_hash = apr_hash_make(newagg->pool); if (!newdp->query_seginfo_hash) { @@ -595,11 +602,6 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr TR2( ("\t %d: (%d)\n", ++cnt, new_query_seginfo->key.segid)); } - // reset metrics that are accumulated each quantum - newdp->qlog.cpu_elapsed = 0; - newdp->qlog.p_metrics.cpu_pct = 0.0; - newdp->num_metrics_packets = 0; - apr_hash_set(newagg->qtab, &newdp->qlog.key, sizeof(newdp->qlog.key), newdp); } @@ -618,10 +620,13 @@ apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt) return agg_put_metrics(agg, &pkt->u.metrics); if (pkt->header.pkttype == GPMON_PKTTYPE_QLOG) return agg_put_qlog(agg, &pkt->u.qlog, agg->generation); + /* + hashdata-lightning not use 
if (pkt->header.pkttype == GPMON_PKTTYPE_QEXEC) return agg_put_qexec(agg, &pkt->u.qexec_packet, agg->generation); if (pkt->header.pkttype == GPMON_PKTTYPE_SEGINFO) return agg_put_segment(agg, &pkt->u.seginfo); + */ if (pkt->header.pkttype == GPMON_PKTTYPE_QUERY_HOST_METRICS) return agg_put_query_metrics(agg, &pkt->u.qlog, agg->generation); if (pkt->header.pkttype == GPMON_PKTTYPE_FSINFO) @@ -647,7 +652,7 @@ static apr_uint32_t write_system(agg_t* agg, const char* nowstr); static apr_uint32_t write_segmentinfo(agg_t* agg, char* nowstr); static apr_uint32_t write_dbmetrics(dbmetrics_t* dbmetrics, char* nowstr); static apr_uint32_t write_qlog(FILE* fp, qdnode_t *qdnode, const char* nowstr, apr_uint32_t done); -static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr); +static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr, apr_pool_t* pool); apr_status_t agg_dump(agg_t* agg) { @@ -719,7 +724,7 @@ apr_status_t agg_dump(agg_t* agg) TR1(("queries_tail: %p add query %d.%d.%d, status %d, generation %d, recorded %d\n", agg->qtab, qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, qdnode->qlog.status, (int) qdnode->last_updated_generation, qdnode->recorded)); - temp_bytes_written += write_qlog_full(fp_queries_tail, qdnode, nowstr); + temp_bytes_written += write_qlog_full(fp_queries_tail, qdnode, nowstr, agg->pool); incremement_tail_bytes(temp_bytes_written); qdnode->recorded = 1; @@ -1088,27 +1093,27 @@ static apr_uint32_t write_system(agg_t* agg, const char* nowstr) return bytes_written; } -static apr_int64_t get_rowsout(qdnode_t* qdnode) -{ - - apr_hash_index_t *hi; - //qenode_t* pqe = NULL; - apr_int64_t rowsout = 0; - void* valptr; - mmon_query_seginfo_t *query_seginfo; - - for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) - { - apr_hash_this(hi, 0, 0, &valptr); - query_seginfo = (mmon_query_seginfo_t*) valptr; - if (query_seginfo->final_rowsout != -1) - { - 
rowsout = query_seginfo->final_rowsout; - break; - } - } - return rowsout; -} +// static apr_int64_t get_rowsout(qdnode_t* qdnode) +// { + +// apr_hash_index_t *hi; +// //qenode_t* pqe = NULL; +// apr_int64_t rowsout = 0; +// void* valptr; +// mmon_query_seginfo_t *query_seginfo; + +// for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) +// { +// apr_hash_this(hi, 0, 0, &valptr); +// query_seginfo = (mmon_query_seginfo_t*) valptr; +// if (query_seginfo->final_rowsout != -1) +// { +// rowsout = query_seginfo->final_rowsout; +// break; +// } +// } +// return rowsout; +// } static void _get_sum_seg_info(apr_hash_t* segtab, apr_int64_t* total_data_out, int* segcount_out) @@ -1227,87 +1232,87 @@ static double get_cpu_skew(qdnode_t* qdnode) return coefficient_of_variation; } -static double get_row_skew(qdnode_t* qdnode) -{ - apr_pool_t* tmp_pool; - apr_hash_t* segtab; - apr_hash_index_t *hi; - - apr_int64_t total_row_out = 0; - apr_int64_t total_deviation_squared = 0; - double variance = 0.0f; - double standard_deviation = 0; - double coefficient_of_variation = 0; - apr_int64_t row_out_avg = 0; - apr_int64_t* seg_row_out_sum = NULL; - void* valptr; - - int segcnt = 0; - int e; - - if (!qdnode) - return 0.0f; - - if (0 != (e = apr_pool_create_alloc(&tmp_pool, 0))) - { - gpmon_warningx(FLINE, e, "apr_pool_create_alloc failed"); - return 0.0f; - } - - segtab = apr_hash_make(tmp_pool); - if (!segtab) - { - gpmon_warning(FLINE, "Out of memory"); - return 0.0f; - } - - /* Calc rows in sum per segment */ - TR2(("Calc rows in sum per segment\n")); - for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) - { - mmon_query_seginfo_t *rec; - apr_hash_this(hi, 0, 0, &valptr); - rec = (mmon_query_seginfo_t*) valptr; - - if (rec->key.segid == -1) - continue; - - seg_row_out_sum = apr_hash_get(segtab, &rec->key.segid, sizeof(rec->key.segid)); - - if (!seg_row_out_sum) { - seg_row_out_sum = apr_palloc(tmp_pool, 
sizeof(apr_int64_t)); - *seg_row_out_sum = 0; - } - *seg_row_out_sum += rec->sum_measures_rows_out; - apr_hash_set(segtab, &rec->key.segid, sizeof(rec->key.segid), seg_row_out_sum); - } - - _get_sum_seg_info(segtab, &total_row_out, &segcnt); - - if (!segcnt) { - TR2(("No segments for Rows skew calculation\n")); - apr_pool_destroy(tmp_pool); - return 0.0f; - } - - row_out_avg = total_row_out / segcnt; - - TR2(("(SKEW) Avg rows out: %" FMT64 "\n", row_out_avg)); - - _get_sum_deviation_squared(segtab, row_out_avg, &total_deviation_squared); - - variance = total_deviation_squared / (double)segcnt; - standard_deviation = sqrt(variance); - - TR2(("(SKEW) Rows in standard deviaton: %f\n", standard_deviation)); - - coefficient_of_variation = row_out_avg ? standard_deviation/(double)row_out_avg : 0.0f; - - apr_pool_destroy(tmp_pool); - TR2(("(SKEW) Rows out skew: %f\n", coefficient_of_variation)); - - return coefficient_of_variation; -} +// static double get_row_skew(qdnode_t* qdnode) +// { +// apr_pool_t* tmp_pool; +// apr_hash_t* segtab; +// apr_hash_index_t *hi; +// +// apr_int64_t total_row_out = 0; +// apr_int64_t total_deviation_squared = 0; +// double variance = 0.0f; +// double standard_deviation = 0; +// double coefficient_of_variation = 0; +// apr_int64_t row_out_avg = 0; +// apr_int64_t* seg_row_out_sum = NULL; +// void* valptr; +// +// int segcnt = 0; +// int e; +// +// if (!qdnode) +// return 0.0f; +// +// if (0 != (e = apr_pool_create_alloc(&tmp_pool, 0))) +// { +// gpmon_warningx(FLINE, e, "apr_pool_create_alloc failed"); +// return 0.0f; +// } +// +// segtab = apr_hash_make(tmp_pool); +// if (!segtab) +// { +// gpmon_warning(FLINE, "Out of memory"); +// return 0.0f; +// } +// +// /* Calc rows in sum per segment */ +// TR2(("Calc rows in sum per segment\n")); +// for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) +// { +// mmon_query_seginfo_t *rec; +// apr_hash_this(hi, 0, 0, &valptr); +// rec = (mmon_query_seginfo_t*) 
valptr; +// +// if (rec->key.segid == -1) +// continue; +// +// seg_row_out_sum = apr_hash_get(segtab, &rec->key.segid, sizeof(rec->key.segid)); +// +// if (!seg_row_out_sum) { +// seg_row_out_sum = apr_palloc(tmp_pool, sizeof(apr_int64_t)); +// *seg_row_out_sum = 0; +// } +// *seg_row_out_sum += rec->sum_measures_rows_out; +// apr_hash_set(segtab, &rec->key.segid, sizeof(rec->key.segid), seg_row_out_sum); +// } +// +// _get_sum_seg_info(segtab, &total_row_out, &segcnt); +// +// if (!segcnt) { +// TR2(("No segments for Rows skew calculation\n")); +// apr_pool_destroy(tmp_pool); +// return 0.0f; +// } +// +// row_out_avg = total_row_out / segcnt; +// +// TR2(("(SKEW) Avg rows out: %" FMT64 "\n", row_out_avg)); +// +// _get_sum_deviation_squared(segtab, row_out_avg, &total_deviation_squared); +// +// variance = total_deviation_squared / (double)segcnt; +// standard_deviation = sqrt(variance); +// +// TR2(("(SKEW) Rows in standard deviaton: %f\n", standard_deviation)); +// +// coefficient_of_variation = row_out_avg ? 
standard_deviation/(double)row_out_avg : 0.0f; +// +// apr_pool_destroy(tmp_pool); +// TR2(("(SKEW) Rows out skew: %f\n", coefficient_of_variation)); +// +// return coefficient_of_variation; +// } static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const char* nowstr, apr_uint32_t done) @@ -1320,10 +1325,22 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch int query_hash = 0; apr_int64_t rowsout = 0; float cpu_current; + int fd_cnt; cpu_skew = get_cpu_skew(qdnode); - row_skew = get_row_skew(qdnode); - rowsout = get_rowsout(qdnode); - gpmon_datetime((time_t)qdnode->qlog.tsubmit, timsubmitted); + qdnode->qlog.p_metrics.cpu_skew += cpu_skew; + //row_skew = get_row_skew(qdnode); + //rowsout = get_rowsout(qdnode); + // get spill file size + gpdb_get_spill_file_size_from_query(qdnode); + + if (qdnode->qlog.tsubmit) + { + gpmon_datetime((time_t)qdnode->qlog.tsubmit, timsubmitted); + } + else + { + snprintf(timsubmitted, GPMON_DATE_BUF_SIZE, "null"); + } if (qdnode->qlog.tstart) { @@ -1334,26 +1351,31 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch snprintf(timstarted, GPMON_DATE_BUF_SIZE, "null"); } - if (done) + if (done && qdnode->qlog.tfin) { - cpu_current = 0.0f; gpmon_datetime((time_t)qdnode->qlog.tfin, timfinished); } else { - if (qdnode->num_metrics_packets) - { - // average cpu_pct per reporting machine - cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; - } - else - { - cpu_current = 0.0f; - } snprintf(timfinished, GPMON_DATE_BUF_SIZE, "null"); } - snprintf(line, line_size, "%s|%d|%d|%d|%d|%s|%u|%d|%s|%s|%s|%s|%" FMT64 "|%" FMT64 "|%.4f|%.2f|%.2f|%d", + + if (qdnode->num_metrics_packets) + { + // average cpu_pct per reporting machine + cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; + fd_cnt = qdnode->qlog.p_metrics.fd_cnt / qdnode->num_metrics_packets; + cpu_skew = qdnode->qlog.p_metrics.cpu_skew / 
qdnode->num_metrics_packets; + } + else + { + cpu_current = 0.0f; + fd_cnt = 0; + cpu_skew = 0.0f; + } + + snprintf(line, line_size, "%s|%d|%d|%d|%d|%s|%u|%d|%s|%s|%s|%s|%" FMT64 "|%" FMT64 "|%.4f|%.2f|%.2f|%d||||||%" FMTU64 "|%" FMTU64 "|%d|%d", nowstr, qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, @@ -1371,7 +1393,12 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch cpu_current, cpu_skew, row_skew, - query_hash); + query_hash, + qdnode->qlog.p_metrics.mem.size, + qdnode->qlog.p_metrics.spill_files_size, + 0, + 0 + ); } @@ -1391,13 +1418,12 @@ static apr_uint32_t write_qlog(FILE* fp, qdnode_t *qdnode, const char* nowstr, a } else { - /* Query text "joined" by python script */ - fprintf(fp, "%s|||||\n", line); + fprintf(fp, "%s\n", line); return bytes_written; } } -static int get_and_print_next_query_file_kvp(FILE* outfd, FILE* queryfd, char* qfname, apr_uint32_t* bytes_written) +static int get_query_file_next_kvp(FILE* queryfd, char* qfname, char** str, apr_pool_t* pool, apr_uint32_t* bytes_written) { const int line_size = 1024; char line[line_size]; @@ -1435,103 +1461,205 @@ static int get_and_print_next_query_file_kvp(FILE* outfd, FILE* queryfd, char* q return APR_NOTFOUND; } - fprintf(outfd, "\""); - (*bytes_written)++; - - while (field_len > 0) { - int max, n; - char* q; - max = field_len > sizeof(line) ? 
sizeof(line) : field_len; - n = fread(line, 1, max, queryfd); - for (p = line, q = line + n; p < q; p++) - { - if (*p == '"') - { - fputc('\"', outfd); - (*bytes_written)++; - } - - fputc(*p, outfd); - - (*bytes_written)++; - - } - field_len -= n; - if (n < max) break; + *str = apr_palloc(pool,(field_len + 1) * sizeof(char)); + memset(*str, 0, field_len+1); + (*str)[field_len] = '\0'; + int n = fread(*str, 1, field_len, queryfd); + if (n!= field_len) + { + gpmon_warning(FLINE, "missing expected bytes in file: %s", qfname); + return APR_NOTFOUND; } - fprintf(outfd, "\""); - (*bytes_written)++; - - int n = fread(line, 1, 1, queryfd); + n = fread(line, 1, 1, queryfd); if (n != 1) { gpmon_warning(FLINE, "missing expected newline in file: %s", qfname); return APR_NOTFOUND; } + *bytes_written += field_len; + return APR_SUCCESS; } -static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr) +static apr_uint32_t get_query_info(FILE* qfptr, char qfname[], char* array[], apr_pool_t* pool) { - const int line_size = 1024; - const int qfname_size = 256; - char line[line_size]; - char qfname[qfname_size]; - FILE* qfptr = 0; - apr_uint32_t bytes_written = 0; + // 0 add query text + // 1 add query plan + // 2 add application name + // 3 add rsqname + // 4 add priority + int total_iterations = 5; + int all_good = 1; + int iter; + apr_uint32_t bytes_written = 0; + int retCode = APR_SUCCESS; + for (iter = 0; iter < total_iterations; ++iter) + { + if (!all_good){ + // we have no data for query plan + // if we failed once already don't bother trying to parse query file + continue; + } + + retCode = get_query_file_next_kvp(qfptr, qfname, &array[iter], pool, &bytes_written); + if (retCode != APR_SUCCESS) + all_good = 0; + } - fmt_qlog(line, line_size, qdnode, nowstr, 1); - bytes_written = strlen(line) + 1; - if (bytes_written == line_size) - { - gpmon_warning(FLINE, "qlog line too long ... 
ignored: %s", line); - return 0; - } + fclose(qfptr); + return bytes_written; +} - fprintf(fp, "%s", line); +static char* replaceQuotes(char *str, apr_pool_t* pool, int* size) { + int len = strlen(str); + int newLen = len; + int quoteCount = 0; - snprintf(qfname, qfname_size, GPMON_DIR "q%d-%d-%d.txt", qdnode->qlog.key.tmid, - qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); + // count the number of quotes + for (int i = 0; i < len; i++) { + if (str[i] == '"') { + quoteCount++; + } + } - qfptr = fopen(qfname, "r"); - if (!qfptr) - { - fprintf(fp, "|||||\n"); - bytes_written += 6; - return bytes_written; + *size += quoteCount; + + newLen += quoteCount; + char* newStr = apr_palloc(pool,(newLen + 1) * sizeof(char)); + int j = 0; + for (int i = 0; i < len; i++) { + if (str[i] == '"') { + newStr[j++] = '"'; + newStr[j++] = '"'; + } else { + newStr[j++] = str[i]; + } } + newStr[j] = '\0'; + return newStr; +} - // 0 add query text - // 1 add query plan - // 2 add application name - // 3 add rsqname - // 4 add priority - - int total_iterations = 5; - int all_good = 1; - int iter; - int retCode = APR_SUCCESS; - for (iter = 0; iter < total_iterations; ++iter) - { - fprintf(fp, "|"); - bytes_written++; +static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr, apr_pool_t* pool) +{ + char timsubmitted[GPMON_DATE_BUF_SIZE]; + char timstarted[GPMON_DATE_BUF_SIZE]; + char timfinished[GPMON_DATE_BUF_SIZE]; + double cpu_skew = 0.0f; + double row_skew = 0.0f; + int query_hash = 0; + apr_int64_t rowsout = 0; + float cpu_current; + int fd_cnt; + cpu_skew = get_cpu_skew(qdnode); + qdnode->qlog.p_metrics.cpu_skew += cpu_skew; + //row_skew = get_row_skew(qdnode); + //rowsout = get_rowsout(qdnode); + + // get spill file size + gpdb_get_spill_file_size_from_query(qdnode); + + if (qdnode->qlog.tsubmit) + { + gpmon_datetime((time_t)qdnode->qlog.tsubmit, timsubmitted); + } + else + { + snprintf(timsubmitted, GPMON_DATE_BUF_SIZE, "null"); + } - if (!all_good){ - // we 
have no data for query plan - // if we failed once already don't bother trying to parse query file - continue; + if (qdnode->qlog.tstart) + { + gpmon_datetime((time_t)qdnode->qlog.tstart, timstarted); + } + else + { + snprintf(timstarted, GPMON_DATE_BUF_SIZE, "null"); } - retCode = get_and_print_next_query_file_kvp(fp, qfptr, qfname, &bytes_written); - if (retCode != APR_SUCCESS) - all_good = 0; - } + if (qdnode->qlog.tfin) + { + gpmon_datetime((time_t)qdnode->qlog.tfin, timfinished); + } + else + { + snprintf(timfinished, GPMON_DATE_BUF_SIZE, "null"); + } - fprintf(fp, "\n"); - fclose(qfptr); - return bytes_written; + + if (qdnode->num_metrics_packets) + { + // average cpu_pct per reporting machine + cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; + fd_cnt = qdnode->qlog.p_metrics.fd_cnt / qdnode->num_metrics_packets; + cpu_skew = qdnode->qlog.p_metrics.cpu_skew / qdnode->num_metrics_packets; + } + else + { + cpu_current = 0.0f; + fd_cnt = 0; + cpu_skew = 0.0f; + } + + + // get query text、plan + char* array[5] = {"", "", "", "", ""}; + const int qfname_size = 256; + char qfname[qfname_size]; + int size = 0; + FILE* qfptr = 0; + snprintf(qfname, qfname_size, GPMON_DIR "q%d-%d-%d.txt", qdnode->qlog.key.tmid, + qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); + qfptr = fopen(qfname, "r"); + if (qfptr) + { + // array[0] query text + // array[1] query plan + // array[2] application name + // array[3] add rsqname + // array[4] add priority + size = get_query_info(qfptr, qfname, array, pool); + array[0] = replaceQuotes(array[0], pool, &size); + array[1] = replaceQuotes(array[1], pool, &size); + } + + int line_size = (1024+size)*sizeof(char); + char* line = apr_palloc(pool,line_size); + memset(line,0,line_size); + snprintf(line, line_size, "%s|%d|%d|%d|%d|%s|%u|%d|%s|%s|%s|%s|%" FMT64 "|%" FMT64 "|%.4f|%.2f|%.2f|%d|\"%s\"|\"%s\"|\"%s\"|\"%s\"|\"%s\"|%" FMTU64 "|%" FMTU64 "|%d|%d", + nowstr, + qdnode->qlog.key.tmid, + qdnode->qlog.key.ssid, + 
qdnode->qlog.key.ccnt, + qdnode->qlog.pid, + qdnode->qlog.user, + qdnode->qlog.dbid, + qdnode->qlog.cost, + timsubmitted, + timstarted, + timfinished, + gpmon_qlog_status_string(qdnode->qlog.status), + rowsout, + qdnode->qlog.cpu_elapsed, + cpu_current, + cpu_skew, + row_skew, + query_hash, + array[0], + array[1], + array[2], + array[3], + array[4], + qdnode->qlog.p_metrics.mem.size, + qdnode->qlog.p_metrics.spill_files_size, + 0, + 0 + ); + + fprintf(fp, "%s\n", line); + apr_uint32_t bytes_written = strlen(line) + 1; + return bytes_written; } static void bloom_init(bloom_t* bloom) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.h b/contrib/perfmon/src/gpmmon/gpmon_agg.h index 2267a5e2790..1da8d5b6318 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.h +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.h @@ -10,5 +10,12 @@ apr_status_t agg_dup(agg_t** agg, agg_t* oldagg, apr_pool_t* pool, apr_hash_t* f void agg_destroy(agg_t* agg); apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt); apr_status_t agg_dump(agg_t* agg); - +typedef struct qdnode_t { + apr_int64_t last_updated_generation; + int recorded; + int num_metrics_packets; + gpmon_qlog_t qlog; + apr_hash_t* qexec_hash; + apr_hash_t* query_seginfo_hash; +} qdnode_t; #endif diff --git a/contrib/perfmon/src/gpmmon/gpmondb.c b/contrib/perfmon/src/gpmmon/gpmondb.c index 746808dcb7e..8709019d846 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.c +++ b/contrib/perfmon/src/gpmmon/gpmondb.c @@ -8,6 +8,7 @@ #include "apr_strings.h" #include "apr_file_io.h" #include "time.h" +#include "gpmon_agg.h" int gpdb_exec_search_for_at_least_one_row(const char*, PGconn*); apr_status_t empty_harvest_file(const char*, apr_pool_t*, PGconn*); @@ -233,7 +234,7 @@ Oid gpdb_gpperfmon_dbid(void) rowcount = PQntuples(result); if (rowcount > 0) { - dbid = DatumGetObjectId(CStringGetDatum(PQgetvalue(result, 0, 0))); + sscanf(PQgetvalue(result, 0, 0), "%u", &dbid); } } PQclear(result); @@ -717,6 +718,48 @@ void 
gpdb_get_single_string_from_query(const char* QUERY, char** resultstring, a *resultstring = tmpoutput; } +void gpdb_get_spill_file_size_from_query(qdnode_t *qdnode) +{ + char query[100]; + snprintf(query, sizeof(query), "select sum(size) from gp_toolkit.gp_workfile_usage_per_query where sess_id=%d And command_cnt=%d;", + qdnode->qlog.key.ssid,qdnode->qlog.key.ccnt); + + PGconn* conn = 0; + PGresult* result = 0; + char* tmpoutput = 0; + int rowcount; + const char* errmsg = gpdb_exec(&conn, &result, query); + if (errmsg) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, query); + } + else + { + rowcount = PQntuples(result); + if (rowcount == 1) + { + tmpoutput = PQgetvalue(result, 0, 0); + } + else if (rowcount > 1) + { + gpmon_warning(FLINE, "unexpected number of rows returned from query %s", query); + } + } + + PQclear(result); + PQfinish(conn); + + if (tmpoutput) + { + uint64_t temp_result; + sscanf(tmpoutput, "%lu", &temp_result); + if (temp_result > 0) + { + qdnode->qlog.p_metrics.spill_files_size = temp_result; + } + } +} + static void check_and_add_partition(PGconn* conn, const char* tbl, int begin_year, int begin_month, int end_year, int end_month) { diff --git a/contrib/perfmon/src/gpmmon/gpmondb.h b/contrib/perfmon/src/gpmmon/gpmondb.h index ffd732a8fef..75a48b8a38c 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.h +++ b/contrib/perfmon/src/gpmmon/gpmondb.h @@ -5,6 +5,7 @@ #include "apr_md5.h" #include "apr_hash.h" #include "gpmonlib.h" +#include "gpmon_agg.h" /** * Validate the the gpperfmon database is correct and @@ -91,6 +92,7 @@ APR_DECLARE (void) create_log_alert_table(void); int find_token_in_config_string(char*, char**, const char*); void process_line_in_hadoop_cluster_info(apr_pool_t*, apr_hash_t*, char*, char*, char*); int get_hadoop_hosts_and_add_to_hosts(apr_pool_t*, apr_hash_t*, mmon_options_t*); +void gpdb_get_spill_file_size_from_query(qdnode_t* qdnode); apr_status_t truncate_file(char*, apr_pool_t*); #endif /* GPMONDB_H */ 
diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 840b864588c..244bd172c98 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -53,6 +53,8 @@ static gpmon_packet_t* gpmon_qlog_packet_init(); static void init_gpmon_hooks(void); static char* get_plan(QueryDesc *queryDesc); static char* get_query_text(QueryDesc *queryDesc); +static int32 tstart = 0; +static int32 tsubmit = 0; struct { int gxsock; @@ -243,6 +245,34 @@ gpmon_qlog_packet_init() return gpmonPacket; } + +/** + * Create and init a qexec packet + * + * It is called by gpmon_query_info_collect_hook each time + */ +static gpmon_packet_t* +gpmon_qexec_packet_init() +{ + gpmon_packet_t *gpmonPacket = NULL; + gpmonPacket = (gpmon_packet_t *) palloc(sizeof(gpmon_packet_t)); + memset(gpmonPacket, 0, sizeof(gpmon_packet_t)); + + Assert(perfmon_enabled && Gp_role == GP_ROLE_EXECUTE); + Assert(gpmonPacket); + + gpmonPacket->magic = GPMON_MAGIC; + gpmonPacket->version = GPMON_PACKET_VERSION; + gpmonPacket->pkttype = GPMON_PKTTYPE_QEXEC; + + gpmon_gettmid(&gpmonPacket->u.qexec.key.tmid); + gpmonPacket->u.qexec.key.ssid = gp_session_id; + gpmonPacket->u.qexec.key.ccnt = gp_command_count; + gpmonPacket->u.qexec.key.hash_key.segid = GpIdentity.segindex; + gpmonPacket->u.qexec.key.hash_key.pid = MyProcPid; + return gpmonPacket; +} + /** * Call this method when query is submitted. 
*/ @@ -254,9 +284,9 @@ void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket) GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); gettimeofday(&tv, 0); - + tsubmit = tv.tv_sec; gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_SUBMIT; - gpmonPacket->u.qlog.tsubmit = tv.tv_sec; + gpmonPacket->u.qlog.tsubmit = tsubmit; gpmon_send(gpmonPacket); } @@ -335,9 +365,11 @@ void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket) GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); gettimeofday(&tv, 0); + tstart = tv.tv_sec; gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_START; - gpmonPacket->u.qlog.tstart = tv.tv_sec; + gpmonPacket->u.qlog.tsubmit = tsubmit; + gpmonPacket->u.qlog.tstart = tstart; gpmon_record_update(gpmonPacket->u.qlog.key.tmid, gpmonPacket->u.qlog.key.ssid, gpmonPacket->u.qlog.key.ccnt, @@ -356,6 +388,8 @@ void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket) gettimeofday(&tv, 0); gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_DONE; + gpmonPacket->u.qlog.tsubmit = tsubmit; + gpmonPacket->u.qlog.tstart = tstart; gpmonPacket->u.qlog.tfin = tv.tv_sec; gpmon_record_update(gpmonPacket->u.qlog.key.tmid, @@ -378,6 +412,8 @@ void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket) gettimeofday(&tv, 0); gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_ERROR; + gpmonPacket->u.qlog.tsubmit = tsubmit; + gpmonPacket->u.qlog.tstart = tstart; gpmonPacket->u.qlog.tfin = tv.tv_sec; gpmon_record_update(gpmonPacket->u.qlog.key.tmid, @@ -397,6 +433,8 @@ gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket) { GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_CANCELING; + gpmonPacket->u.qlog.tsubmit = tsubmit; + gpmonPacket->u.qlog.tstart = tstart; gpmon_record_update(gpmonPacket->u.qlog.key.tmid, gpmonPacket->u.qlog.key.ssid, @@ -412,11 +450,12 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) char *query_text; char *plan; QueryDesc *qd = (QueryDesc *)queryDesc; - if (perfmon_enabled - && Gp_role == GP_ROLE_DISPATCH && qd != NULL) + if 
(perfmon_enabled && qd != NULL) { gpmon_packet_t *gpmonPacket = NULL; PG_TRY(); + { + if (Gp_role == GP_ROLE_DISPATCH) { gpmonPacket = gpmon_qlog_packet_init(); switch (status) @@ -463,6 +502,21 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) } pfree(gpmonPacket); } + else if (Gp_role == GP_ROLE_EXECUTE) + { + gpmonPacket = gpmon_qexec_packet_init(); + switch (status) + { + case METRICS_QUERY_START: + case METRICS_PLAN_NODE_EXECUTING: + gpmon_send(gpmonPacket); + break; + default: + break; + } + pfree(gpmonPacket); + } + } PG_CATCH(); { EmitErrorReport(); diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index a478267e5d0..5c5e0e63822 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -96,9 +96,9 @@ struct gx_t const char* hostname; /* my hostname */ /* hash tables */ - apr_hash_t* qexectab; /* stores qexec packets */ + apr_hash_t* qexectab; /* stores qexec packets hashdata-lightning not use*/ apr_hash_t* qlogtab; /* stores qlog packets */ - apr_hash_t* segmenttab; /* stores segment packets */ + apr_hash_t* segmenttab; /* stores segment packets hashdata-lightning not use*/ apr_hash_t* pidtab; /* key=pid, value=pidrec_t */ apr_hash_t* querysegtab; /* stores gpmon_query_seginfo_t */ }; @@ -161,7 +161,7 @@ void update_log_filename() static void gx_accept(SOCKET sock, short event, void* arg); static void gx_recvfrom(SOCKET sock, short event, void* arg); -static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt); +//static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt); /** * helper function to copy the union packet from a gpmon_packet_t to a gp_smon_to_mmon_packet_t @@ -255,6 +255,7 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, apr_int32_t status; sigar_proc_cpu_t cpu; sigar_proc_mem_t mem; + sigar_mem_t system_mem; sigar_proc_fd_t fd; 
pidrec_t* rec; apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); @@ -325,6 +326,16 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, return; } + status = sigar_mem_get(gx.sigar,&system_mem); + if (status != SIGAR_OK) + { + if (status != ESRCH) + { + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + } + return; + } + rec->updated_tick = gx.tick; rec->p_metrics.fd_cnt = (apr_uint32_t) fd.total; rec->p_metrics.cpu_pct = (float) (cpu.percent * cpu_cores_utilization_multiplier); @@ -715,7 +726,9 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) pidrec_t* pidrec; int count = 0; apr_hash_t* query_cpu_table = NULL; + sigar_proc_state_t state; + /* for (hi = apr_hash_first(0, segtab); hi; hi = apr_hash_next(hi)) { void* vptr; @@ -724,14 +737,14 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) if (ppkt->header.pkttype != GPMON_PKTTYPE_SEGINFO) continue; - /* fill in hostname */ + // fill in hostname strncpy(ppkt->u.seginfo.hostname, gx.hostname, sizeof(ppkt->u.seginfo.hostname) - 1); ppkt->u.seginfo.hostname[sizeof(ppkt->u.seginfo.hostname) - 1] = 0; TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); send_smon_to_mon_pkt(sock, ppkt); count++; - } + }*/ for (hi = apr_hash_first(0, qdtab); hi; hi = apr_hash_next(hi)) { @@ -762,23 +775,24 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) count++; } + /* for (hi = apr_hash_first(0, qetab); hi; hi = apr_hash_next(hi)) { gpmon_qexec_t* qexec; void *vptr; apr_hash_this(hi, 0, 0, &vptr); - qexec = vptr; - /* fill in _p_metrics */ - pidrec = apr_hash_get(pidtab, &qexec->key.hash_key.pid, sizeof(qexec->key.hash_key.pid)); - if (pidrec) { - qexec->_p_metrics = pidrec->p_metrics; - qexec->_cpu_elapsed = pidrec->cpu_elapsed; - } else { - memset(&qexec->_p_metrics, 0, sizeof(qexec->_p_metrics)); - } - - /* fill in _hname */ + qexec = vptr; + // fill in _p_metrics + pidrec = apr_hash_get(pidtab, 
&qexec->key.hash_key.pid, sizeof(qexec->key.hash_key.pid)); + if (pidrec) { + qexec->_p_metrics = pidrec->p_metrics; + qexec->_cpu_elapsed = pidrec->cpu_elapsed; + } else { + memset(&qexec->_p_metrics, 0, sizeof(qexec->_p_metrics)); + } + + // fill in _hname strncpy(qexec->_hname, gx.hostname, sizeof(qexec->_hname) - 1); qexec->_hname[sizeof(qexec->_hname) - 1] = 0; @@ -786,12 +800,13 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) break; } - TR2(("sending qexec, pkttype %d\n", localPacketObject.header.pkttype)); - send_smon_to_mon_pkt(sock, &localPacketObject); - count++; + TR2(("sending qexec, pkttype %d\n", localPacketObject.header.pkttype)); + send_smon_to_mon_pkt(sock, &localPacketObject); + count++; } + */ - // calculate CPU utilization per query for this machine + // calculate CPU utilization And Memory utilization per query for this machine query_cpu_table = apr_hash_make(oldpool); CHECKMEM(query_cpu_table); @@ -817,6 +832,10 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) lookup->cpu_elapsed += pidrec->cpu_elapsed; lookup->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; + lookup->p_metrics.fd_cnt += lookup->p_metrics.fd_cnt; + lookup->p_metrics.mem.resident += lookup->p_metrics.mem.resident; + lookup->p_metrics.mem.size += lookup->p_metrics.mem.size; + lookup->p_metrics.mem.share += lookup->p_metrics.mem.share; } else { @@ -824,6 +843,12 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) apr_hash_set(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key), pidrec); } + //add to new pidtab if process is exist + int status = sigar_proc_state_get(gx.sigar,pidrec->pid, &state); + if (status == SIGAR_OK) + { + apr_hash_set(gx.pidtab, &pidrec->pid, sizeof(pidrec->pid), pidrec); + } } // reset packet to 0 @@ -847,6 +872,9 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) ppkt->u.qlog.key.ccnt = pidrec->query_key.ccnt; ppkt->u.qlog.cpu_elapsed = pidrec->cpu_elapsed; 
ppkt->u.qlog.p_metrics.cpu_pct = pidrec->p_metrics.cpu_pct; + ppkt->u.qlog.p_metrics.fd_cnt = pidrec->p_metrics.fd_cnt; + ppkt->u.qlog.p_metrics.mem = pidrec->p_metrics.mem; + ppkt->u.qlog.pid = pidrec->pid; TR2(("SEND tmid %d ssid %d ccnt %d (CPU elapsed %ld CPU Percent %.2f)\n", ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt, @@ -979,8 +1007,7 @@ static void gx_recvqlog(gpmon_packet_t* pkt) rec = apr_hash_get(gx.qlogtab, &p->key, sizeof(p->key)); if (rec) { - //memcpy(&rec->u.qlog, p, sizeof(*p)); - merge_qlog(&rec->u.qlog, p); + memcpy(&rec->u.qlog, p, sizeof(*p)); } else { @@ -1017,17 +1044,17 @@ static void gx_recvsegment(gpmon_packet_t* pkt) * write the qexec packet. * @return 1 if success, 0 if failure */ -static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt) -{ - // Copy over needed values - memcpy(&pkt->u.qexec_packet.data.key, &qexec->key, sizeof(gpmon_qexeckey_t)); - pkt->u.qexec_packet.data.measures_rows_in = qexec->rowsout; - pkt->u.qexec_packet.data._cpu_elapsed = qexec->_cpu_elapsed; - pkt->u.qexec_packet.data.rowsout = qexec->rowsout; - - gp_smon_to_mmon_set_header(pkt,GPMON_PKTTYPE_QEXEC); - return 1; -} +// static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt) +// { +// // Copy over needed values +// memcpy(&pkt->u.qexec_packet.data.key, &qexec->key, sizeof(gpmon_qexeckey_t)); +// pkt->u.qexec_packet.data.measures_rows_in = qexec->rowsout; +// pkt->u.qexec_packet.data._cpu_elapsed = qexec->_cpu_elapsed; +// pkt->u.qexec_packet.data.rowsout = qexec->rowsout; + +// gp_smon_to_mmon_set_header(pkt,GPMON_PKTTYPE_QEXEC); +// return 1; +// } static void extract_segments_exec(gpmon_packet_t* pkt) { @@ -1052,11 +1079,11 @@ static void extract_segments_exec(gpmon_packet_t* pkt) if (rec) { rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; - rec->u.queryseg.sum_measures_rows_out += p->rowsout; - if (p->key.hash_key.segid == -1 && 
p->key.hash_key.nid == 1 && (int64)(p->rowsout) > rec->u.queryseg.final_rowsout) - { - rec->u.queryseg.final_rowsout = p->rowsout; - } + // rec->u.queryseg.sum_measures_rows_out += p->rowsout; + // if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1 && (int64)(p->rowsout) > rec->u.queryseg.final_rowsout) + // { + // rec->u.queryseg.final_rowsout = p->rowsout; + // } } else { @@ -1065,16 +1092,16 @@ static void extract_segments_exec(gpmon_packet_t* pkt) CHECKMEM(rec); gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); rec->u.queryseg.key = qseg_key; - if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1) - { - rec->u.queryseg.final_rowsout = p->rowsout; - } - else - { - rec->u.queryseg.final_rowsout = -1; - } + // if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1) + // { + // rec->u.queryseg.final_rowsout = p->rowsout; + // } + // else + // { + // rec->u.queryseg.final_rowsout = -1; + // } rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; - rec->u.queryseg.sum_measures_rows_out = p->rowsout; + //rec->u.queryseg.sum_measures_rows_out = p->rowsout; apr_hash_set(gx.querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); } } @@ -1608,17 +1635,20 @@ void gx_main(int port, apr_int64_t signature) gpsmon_fatalx(FLINE, APR_FROM_OS_ERROR(errno), "event_dispatch failed"); } - /* get pid metrics */ - for (hi = apr_hash_first(0, gx.qexectab); hi; hi = apr_hash_next(hi)) + /* refresh pid metrics */ + for (hi = apr_hash_first(0, gx.pidtab); hi; hi = apr_hash_next(hi)) { - void* vptr; - gpmon_qexec_t* rec; - apr_hash_this(hi, 0, 0, &vptr); - rec = vptr; - get_pid_metrics(rec->key.hash_key.pid, - rec->key.tmid, - rec->key.ssid, - rec->key.ccnt); + void* vptr; + pidrec_t* rec; + apr_hash_this(hi, 0, 0, &vptr); + rec = vptr; + if (rec) + { + get_pid_metrics(rec->pid, + rec->query_key.tmid, + rec->query_key.ssid, + rec->query_key.ccnt); + } } /* check log size */ diff --git a/contrib/perfmon/src/include/gpmon.h 
b/contrib/perfmon/src/include/gpmon.h index a9f779abc45..4fe9fb87df1 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -153,6 +153,8 @@ typedef struct gpmon_proc_metrics_t gpmon_proc_metrics_t; struct gpmon_proc_metrics_t { uint32 fd_cnt; /* # opened files / sockets etc */ float cpu_pct; /* cpu usage % */ + double cpu_skew; + uint64 spill_files_size; struct { uint64 size, resident, share; } mem; From 21927aa56258b78c6c953206d82fc759eacf60ae Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Wed, 18 Sep 2024 14:19:48 +0800 Subject: [PATCH 14/40] Perfmon: fix coredump error Fix the gpsmon crashing due to the incorrect use of the memory pool. --- contrib/perfmon/src/gpmmon/gpmon_agg.c | 1 + contrib/perfmon/src/gpmmon/gpmondb.c | 6 +- contrib/perfmon/src/gpsmon/gpsmon.c | 303 +++++++++---------------- 3 files changed, 105 insertions(+), 205 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 82ab5234e50..59d0a6402f8 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -365,6 +365,7 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, node->qlog.p_metrics.fd_cnt = 0; node->qlog.p_metrics.cpu_skew = 0.0f; node->qlog.p_metrics.mem.size = 0; + node->qlog.p_metrics.spill_files_size = 0; node->num_metrics_packets = 0; node->qexec_hash = apr_hash_make(agg->pool); diff --git a/contrib/perfmon/src/gpmmon/gpmondb.c b/contrib/perfmon/src/gpmmon/gpmondb.c index 8709019d846..199ebb13a1e 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.c +++ b/contrib/perfmon/src/gpmmon/gpmondb.c @@ -720,7 +720,7 @@ void gpdb_get_single_string_from_query(const char* QUERY, char** resultstring, a void gpdb_get_spill_file_size_from_query(qdnode_t *qdnode) { - char query[100]; + char query[200]; snprintf(query, sizeof(query), "select sum(size) from gp_toolkit.gp_workfile_usage_per_query where sess_id=%d And command_cnt=%d;", 
qdnode->qlog.key.ssid,qdnode->qlog.key.ccnt); @@ -751,9 +751,9 @@ void gpdb_get_spill_file_size_from_query(qdnode_t *qdnode) if (tmpoutput) { - uint64_t temp_result; + uint64_t temp_result = 0; sscanf(tmpoutput, "%lu", &temp_result); - if (temp_result > 0) + if (temp_result > 0 && temp_result > qdnode->qlog.p_metrics.spill_files_size) { qdnode->qlog.p_metrics.spill_files_size = temp_result; } diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index 5c5e0e63822..9519b32f435 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -62,6 +62,8 @@ struct pidrec_t gpmon_proc_metrics_t p_metrics; apr_uint64_t cpu_elapsed; gpmon_qlogkey_t query_key; + gpmon_query_seginfo_key_t qseg_key; + gpmon_qexec_hash_key_t hash_key; }; typedef struct gx_t gx_t; @@ -96,9 +98,9 @@ struct gx_t const char* hostname; /* my hostname */ /* hash tables */ - apr_hash_t* qexectab; /* stores qexec packets hashdata-lightning not use*/ + //apr_hash_t* qexectab; /* stores qexec packets hashdata-lightning not use*/ apr_hash_t* qlogtab; /* stores qlog packets */ - apr_hash_t* segmenttab; /* stores segment packets hashdata-lightning not use*/ + //apr_hash_t* segmenttab; /* stores segment packets hashdata-lightning not use*/ apr_hash_t* pidtab; /* key=pid, value=pidrec_t */ apr_hash_t* querysegtab; /* stores gpmon_query_seginfo_t */ }; @@ -250,7 +252,7 @@ static void send_smon_to_mon_pkt(SOCKET sock, gp_smon_to_mmon_packet_t* pkt) TR2(("Sent packet of type %d to mmon\n", pkt->header.pkttype)); } -static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt) +static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt) { apr_int32_t status; sigar_proc_cpu_t cpu; @@ -260,7 +262,7 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, pidrec_t* rec; apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); - rec = 
apr_hash_get(gx.pidtab, &pid, sizeof(pid)); + rec = apr_hash_get(gx.pidtab, &key.pid, sizeof(key.pid)); if (rec && rec->updated_tick == gx.tick) return; /* updated in current cycle */ @@ -268,7 +270,7 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, memset(&mem, 0, sizeof(mem)); memset(&fd, 0, sizeof(fd)); - TR2(("--------------------- starting %d\n", pid)); + TR2(("--------------------- starting %d\n", key.pid)); if (!rec) { @@ -279,13 +281,22 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, rec = apr_pcalloc(pool, sizeof(*rec)); CHECKMEM(rec); - rec->pid = pid; + gpmon_query_seginfo_key_t qseg_key; + qseg_key.qkey.tmid = tmid; + qseg_key.qkey.ssid = ssid; + qseg_key.qkey.ccnt = ccnt; + qseg_key.segid = key.segid; + + + rec->pid = key.pid; rec->query_key.tmid = tmid; rec->query_key.ssid = ssid; rec->query_key.ccnt = ccnt; + rec->qseg_key = qseg_key; + rec->hash_key = key; rec->pname = rec->cwd = 0; - if (0 == sigar_proc_exe_get(gx.sigar, pid, &exe)) + if (0 == sigar_proc_exe_get(gx.sigar, key.pid, &exe)) { rec->pname = apr_pstrdup(pool, exe.name); rec->cwd = apr_pstrdup(pool, exe.root); @@ -298,30 +309,30 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, apr_hash_set(gx.pidtab, &rec->pid, sizeof(rec->pid), rec); } - status = sigar_proc_mem_get(gx.sigar, pid, &mem); + status = sigar_proc_mem_get(gx.sigar, key.pid, &mem); /* ESRCH is error 3: (No such process) */ if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); } return; } - status = sigar_proc_cpu_get(gx.sigar, pid, &cpu); + status = sigar_proc_cpu_get(gx.sigar, key.pid, &cpu); if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + TR2(("[WARNING] %s. 
PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); } return; } - status = sigar_proc_fd_get(gx.sigar, pid, &fd); + status = sigar_proc_fd_get(gx.sigar, key.pid, &fd); if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); } return; } @@ -331,7 +342,7 @@ static void get_pid_metrics(apr_int32_t pid, apr_int32_t tmid, apr_int32_t ssid, { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), pid)); + TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); } return; } @@ -650,10 +661,8 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) char dump; int n, e; apr_pool_t* oldpool; - apr_hash_t* qetab; apr_hash_t* qdtab; apr_hash_t* pidtab; - apr_hash_t* segtab; if (event & EV_TIMEOUT) // didn't get command from gpmmon, quit { if(gx.tcp_sock) @@ -678,13 +687,11 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) TR1(("start dump %c\n", dump)); - qetab = gx.qexectab; qdtab = gx.qlogtab; pidtab = gx.pidtab; - segtab = gx.segmenttab; querysegtab = gx.querysegtab; - oldpool = apr_hash_pool_get(qetab); + oldpool = apr_hash_pool_get(querysegtab); /* make new hashtabs for next cycle */ { @@ -693,18 +700,11 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) { gpsmon_fatalx(FLINE, e, "apr_pool_create_alloc failed"); } - /* qexec hash table */ - gx.qexectab = apr_hash_make(newpool); - CHECKMEM(gx.qexectab); /* qlog hash table */ gx.qlogtab = apr_hash_make(newpool); CHECKMEM(gx.qlogtab); - /* segment hash table */ - gx.segmenttab = apr_hash_make(newpool); - CHECKMEM(gx.segmenttab); - /* queryseg hash table */ gx.querysegtab = apr_hash_make(newpool); CHECKMEM(gx.querysegtab); @@ -726,25 +726,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) pidrec_t* pidrec; int count = 0; apr_hash_t* query_cpu_table = NULL; - 
sigar_proc_state_t state; - - /* - for (hi = apr_hash_first(0, segtab); hi; hi = apr_hash_next(hi)) - { - void* vptr; - apr_hash_this(hi, 0, 0, &vptr); - ppkt = vptr; - if (ppkt->header.pkttype != GPMON_PKTTYPE_SEGINFO) - continue; - - // fill in hostname - strncpy(ppkt->u.seginfo.hostname, gx.hostname, sizeof(ppkt->u.seginfo.hostname) - 1); - ppkt->u.seginfo.hostname[sizeof(ppkt->u.seginfo.hostname) - 1] = 0; - - TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); - send_smon_to_mon_pkt(sock, ppkt); - count++; - }*/ + sigar_proc_state_t state; for (hi = apr_hash_first(0, qdtab); hi; hi = apr_hash_next(hi)) { @@ -753,59 +735,13 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) ppkt = vptr; if (ppkt->header.pkttype != GPMON_PKTTYPE_QLOG) continue; - TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); - send_smon_to_mon_pkt(sock, ppkt); - count++; - } - - /* - * QUERYSEG packets must be sent after QLOG packets so that gpmmon can - * correctly populate its query_seginfo_hash. 
- */ - for (hi = apr_hash_first(0, querysegtab); hi; hi = apr_hash_next(hi)) - { - void* vptr; - apr_hash_this(hi, 0, 0, &vptr); - ppkt = vptr; - if (ppkt->header.pkttype != GPMON_PKTTYPE_QUERYSEG) - continue; - TR2(("sending magic %x, pkttype %d\n", ppkt->header.magic, ppkt->header.pkttype)); + TR2(("%s: sending magic %x, pkttype %d, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->header.pkttype, + ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt)); send_smon_to_mon_pkt(sock, ppkt); count++; } - /* - for (hi = apr_hash_first(0, qetab); hi; hi = apr_hash_next(hi)) - { - gpmon_qexec_t* qexec; - void *vptr; - - apr_hash_this(hi, 0, 0, &vptr); - qexec = vptr; - // fill in _p_metrics - pidrec = apr_hash_get(pidtab, &qexec->key.hash_key.pid, sizeof(qexec->key.hash_key.pid)); - if (pidrec) { - qexec->_p_metrics = pidrec->p_metrics; - qexec->_cpu_elapsed = pidrec->cpu_elapsed; - } else { - memset(&qexec->_p_metrics, 0, sizeof(qexec->_p_metrics)); - } - - // fill in _hname - strncpy(qexec->_hname, gx.hostname, sizeof(qexec->_hname) - 1); - qexec->_hname[sizeof(qexec->_hname) - 1] = 0; - - if (0 == create_qexec_packet(qexec, &localPacketObject)) { - break; - } - - TR2(("sending qexec, pkttype %d\n", localPacketObject.header.pkttype)); - send_smon_to_mon_pkt(sock, &localPacketObject); - count++; - } - */ - // calculate CPU utilization And Memory utilization per query for this machine query_cpu_table = apr_hash_make(oldpool); CHECKMEM(query_cpu_table); @@ -815,13 +751,18 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) { void* vptr; pidrec_t* lookup; + pidrec_t* pidrec; apr_hash_this(hi, 0, 0, &vptr); pidrec = vptr; + if (!pidrec) + { + continue; + } - TR2(("tmid %d ssid %d ccnt %d pid %d (CPU elapsed %ld CPU Percent %.2f)\n", - pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, - pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct)); + TR2(("%s: %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", + 
FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, + pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct, pidrec->p_metrics.mem.size)); // table is keyed on query key lookup = apr_hash_get(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key)); @@ -843,14 +784,59 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) apr_hash_set(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key), pidrec); } + + // add to queryseg hash table + gp_smon_to_mmon_packet_t* rec; + rec = apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); + if (rec) + { + rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; + } + else + { + rec = apr_palloc(apr_hash_pool_get(querysegtab),sizeof(gp_smon_to_mmon_packet_t)); + CHECKMEM(rec); + gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); + rec->u.queryseg.key = pidrec->qseg_key; + rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; + apr_hash_set(querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); + } + //add to new pidtab if process is exist int status = sigar_proc_state_get(gx.sigar,pidrec->pid, &state); if (status == SIGAR_OK) { - apr_hash_set(gx.pidtab, &pidrec->pid, sizeof(pidrec->pid), pidrec); + apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); + pidrec_t* newpidrec = apr_palloc(pool, sizeof(*pidrec)); + memcpy(newpidrec, pidrec, sizeof(*pidrec)); + apr_hash_set(gx.pidtab, &newpidrec->pid, sizeof(newpidrec->pid), newpidrec); + TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); + continue; } + TR2(("%s: %d-%d-%d pid %d pid status %d not add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, status)); } + + /* + * QUERYSEG packets must be sent after QLOG packets so that gpmmon can + * correctly populate its query_seginfo_hash. 
+ */ + for (hi = apr_hash_first(0, querysegtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + apr_hash_this(hi, 0, 0, &vptr); + ppkt = vptr; + if (ppkt->header.pkttype != GPMON_PKTTYPE_QUERYSEG) + continue; + + TR2(("%s: sending magic %x, pkttype %d, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->header.pkttype, + ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt)); + send_smon_to_mon_pkt(sock, ppkt); + count++; + } + // reset packet to 0 ppkt = &localPacketObject; memset(ppkt, 0, sizeof(gp_smon_to_mmon_packet_t)); @@ -876,9 +862,9 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) ppkt->u.qlog.p_metrics.mem = pidrec->p_metrics.mem; ppkt->u.qlog.pid = pidrec->pid; - TR2(("SEND tmid %d ssid %d ccnt %d (CPU elapsed %ld CPU Percent %.2f)\n", - ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt, - ppkt->u.qlog.cpu_elapsed, ppkt->u.qlog.p_metrics.cpu_pct)); + TR2(("%s: SEND %d-%d-%d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", + FLINE, ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt, + ppkt->u.qlog.cpu_elapsed, ppkt->u.qlog.p_metrics.cpu_pct, ppkt->u.qlog.p_metrics.mem.size)); send_smon_to_mon_pkt(sock, ppkt); count++; @@ -890,6 +876,9 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) /* get rid of the old pool */ { apr_pool_destroy(oldpool); + qdtab = NULL; + pidtab = NULL; + querysegtab = NULL; } struct timeval tv; tv.tv_sec = opt.terminate_timeout; @@ -1003,7 +992,7 @@ static void gx_recvqlog(gpmon_packet_t* pkt) gpsmon_fatal(FLINE, "assert failed; expected pkttype qlog"); p = &pkt->u.qlog; - TR2(("Received qlog packet for query %d-%d-%d. 
Status now %d\n", p->key.tmid, p->key.ssid, p->key.ccnt, p->status)); + TR2(("%s Received qlog packet for query %d-%d-%d pid %d Status now %d\n", FLINE, p->key.tmid, p->key.ssid, p->key.ccnt, p->pid, p->status)); rec = apr_hash_get(gx.qlogtab, &p->key, sizeof(p->key)); if (rec) { @@ -1016,112 +1005,23 @@ static void gx_recvqlog(gpmon_packet_t* pkt) } } -static void gx_recvsegment(gpmon_packet_t* pkt) -{ - gpmon_seginfo_t* p; - gp_smon_to_mmon_packet_t* rec; - - if (pkt->pkttype != GPMON_PKTTYPE_SEGINFO) - gpsmon_fatal(FLINE, "assert failed; expected pkttype segment"); - - p = &pkt->u.seginfo; - - TR2(("Received segment packet for dbid %d (dynamic_memory_used, dynamic_memory_available) (%lu %lu)\n", p->dbid, p->dynamic_memory_used, p->dynamic_memory_available)); - - rec = apr_hash_get(gx.segmenttab, &p->dbid, sizeof(p->dbid)); - if (rec) - { - memcpy(&rec->u.seginfo, p, sizeof(*p)); - } - else - { - rec = gx_pkt_to_smon_to_mmon(apr_hash_pool_get(gx.segmenttab), pkt); - apr_hash_set(gx.segmenttab, &rec->u.seginfo.dbid, sizeof(rec->u.seginfo.dbid), rec); - } -} - -/** -* write the qexec packet. 
-* @return 1 if success, 0 if failure -*/ -// static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt) -// { -// // Copy over needed values -// memcpy(&pkt->u.qexec_packet.data.key, &qexec->key, sizeof(gpmon_qexeckey_t)); -// pkt->u.qexec_packet.data.measures_rows_in = qexec->rowsout; -// pkt->u.qexec_packet.data._cpu_elapsed = qexec->_cpu_elapsed; -// pkt->u.qexec_packet.data.rowsout = qexec->rowsout; - -// gp_smon_to_mmon_set_header(pkt,GPMON_PKTTYPE_QEXEC); -// return 1; -// } - -static void extract_segments_exec(gpmon_packet_t* pkt) -{ - gpmon_qexec_t *p; - gp_smon_to_mmon_packet_t *rec; - gpmon_query_seginfo_key_t qseg_key; - pidrec_t *pidrec; - - if (pkt->pkttype != GPMON_PKTTYPE_QEXEC) - gpsmon_fatal(FLINE, "assert failed; expected pkttype qexec"); - - p = &pkt->u.qexec; - qseg_key.qkey.tmid = p->key.tmid; - qseg_key.qkey.ssid = p->key.ssid; - qseg_key.qkey.ccnt = p->key.ccnt; - qseg_key.segid = p->key.hash_key.segid; - - rec = apr_hash_get(gx.querysegtab, &qseg_key, sizeof(qseg_key)); - pidrec = apr_hash_get(gx.pidtab, &p->key.hash_key.pid, sizeof(p->key.hash_key.pid)); - ASSERT(pidrec); - - if (rec) - { - rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; - // rec->u.queryseg.sum_measures_rows_out += p->rowsout; - // if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1 && (int64)(p->rowsout) > rec->u.queryseg.final_rowsout) - // { - // rec->u.queryseg.final_rowsout = p->rowsout; - // } - } - else - { - rec = apr_palloc(apr_hash_pool_get(gx.querysegtab), - sizeof(gp_smon_to_mmon_packet_t)); - CHECKMEM(rec); - gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); - rec->u.queryseg.key = qseg_key; - // if (p->key.hash_key.segid == -1 && p->key.hash_key.nid == 1) - // { - // rec->u.queryseg.final_rowsout = p->rowsout; - // } - // else - // { - // rec->u.queryseg.final_rowsout = -1; - // } - rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; - //rec->u.queryseg.sum_measures_rows_out = p->rowsout; - 
apr_hash_set(gx.querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); - } -} - static void gx_recvqexec(gpmon_packet_t* pkt) { gpmon_qexec_t* p; if (pkt->pkttype != GPMON_PKTTYPE_QEXEC) gpsmon_fatal(FLINE, "assert failed; expected pkttype qexec"); - TR2(("received qexec packet\n")); + + TR2(("%s received qexec packet %d-%d-%d pid %d\n", FLINE, pkt->u.qlog.key.tmid, pkt->u.qlog.key.ssid, pkt->u.qlog.key.ccnt, pkt->u.qlog.pid)); p = &pkt->u.qexec; - get_pid_metrics(p->key.hash_key.pid, + get_pid_metrics(p->key.hash_key, p->key.tmid, p->key.ssid, p->key.ccnt); // Store some aggregated information somewhere for metrics in // queries_* tables, like cpu_elapsed, rows_out, and etc. - extract_segments_exec(pkt); + //extract_segments_exec(pkt); // We don't call gpmon_warning here because the number of // packet is big, and we would make log boating. return; @@ -1167,9 +1067,6 @@ static void gx_recvfrom(SOCKET sock, short event, void* arg) case GPMON_PKTTYPE_QLOG: gx_recvqlog(&pkt); break; - case GPMON_PKTTYPE_SEGINFO: - gx_recvsegment(&pkt); - break; case GPMON_PKTTYPE_QEXEC: gx_recvqexec(&pkt); break; @@ -1448,16 +1345,16 @@ static void setup_gx(int port, apr_int64_t signature) } /* qexec hash table */ - gx.qexectab = apr_hash_make(subpool); - CHECKMEM(gx.qexectab); + //gx.qexectab = apr_hash_make(subpool); + //CHECKMEM(gx.qexectab); /* qlog hash table */ gx.qlogtab = apr_hash_make(subpool); CHECKMEM(gx.qlogtab); /* segment hash table */ - gx.segmenttab = apr_hash_make(subpool); - CHECKMEM(gx.segmenttab); + //gx.segmenttab = apr_hash_make(subpool); + //CHECKMEM(gx.segmenttab); /* queryseg hash table */ gx.querysegtab = apr_hash_make(subpool); @@ -1644,7 +1541,9 @@ void gx_main(int port, apr_int64_t signature) rec = vptr; if (rec) { - get_pid_metrics(rec->pid, + TR2(("%s: %d-%d-%d pid %d refresh process metrics \n ", + FLINE, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt, rec->pid)); + get_pid_metrics(rec->hash_key, rec->query_key.tmid, 
rec->query_key.ssid, rec->query_key.ccnt); From 363908e9b92098bd1f478174facc74196f4fc1d5 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 19 Jul 2024 11:54:42 +0800 Subject: [PATCH 15/40] Add pg_query_state https://github.com/postgrespro/pg_query_state Based on the above repo and apply it on hashdata-lightning. The pg_query_state module provides facility to know the current state of query execution on working backend. To use it, you should config shared_preload_libraries and create extension: shared_preload_libraries = 'pg_query_state' CREATE EXTENSION pg_query_state; The original pg_query_state sends signal to the query process. Then the query process sends the query state back through shm_mq. In this pr, we modify the origin pg_query_state. Using cbdb_mpp_query_state to collect query state from all qEs, and send them to the query qD process by shm_mq. Then the qD process used them to generate the final result like explain analyze. Beside the runtime query state, we also collect the query state at the end of the query in function qs_ExecutorEnd and save them in the perfmon tables. ----- gpmon.c update if the pq_query_state is enabled (enable_qs_runtime), then it fetch the query plan from CachedQueryStateInfo which is generated by pg_query_state at the end of the query. 
Otherwise, it still fetch the plan by 'explain' command --- contrib/perfmon/perfmon--1.0.0--1.1.0.sql | 42 +- contrib/perfmon/perfmon.control | 2 +- contrib/perfmon/perfmon.sql | 49 +- contrib/perfmon/src/gpmon/Makefile | 4 +- contrib/perfmon/src/gpmon/gpmon.c | 61 +- contrib/perfmon/src/gpmon/pg_query_state.c | 1738 +++++ contrib/perfmon/src/gpmon/pg_query_state.h | 173 + contrib/perfmon/src/gpmon/signal_handler.c | 778 +++ contrib/perfmon/src/gpsmon/gpsmon.c | 1 - contrib/perfmon/src/include/gpmon.h | 2 +- .../vectorization/src/backend/hook/explain.c | 6049 +++++++++++++++++ src/backend/commands/explain.c | 146 +- src/backend/commands/explain_gp.c | 396 +- src/backend/executor/instrument.c | 6 + src/backend/storage/ipc/procsignal.c | 105 + src/backend/tcop/postgres.c | 5 +- src/include/cdb/cdbexplain.h | 10 +- src/include/commands/explain.h | 2 + src/include/executor/instrument.h | 8 + src/include/storage/procsignal.h | 17 + src/include/utils/metrics_utils.h | 1 + .../query_info_hook_test.c | 2 + 22 files changed, 9512 insertions(+), 85 deletions(-) create mode 100644 contrib/perfmon/src/gpmon/pg_query_state.c create mode 100644 contrib/perfmon/src/gpmon/pg_query_state.h create mode 100644 contrib/perfmon/src/gpmon/signal_handler.c create mode 100644 contrib/vectorization/src/backend/hook/explain.c diff --git a/contrib/perfmon/perfmon--1.0.0--1.1.0.sql b/contrib/perfmon/perfmon--1.0.0--1.1.0.sql index c9527a65f66..12c65fe8c62 100644 --- a/contrib/perfmon/perfmon--1.0.0--1.1.0.sql +++ b/contrib/perfmon/perfmon--1.0.0--1.1.0.sql @@ -20,4 +20,44 @@ ALTER FOREIGN TABLE _queries_tail ADD COLUMN mem_peak BIGINT NOT NULL, ADD COLUMN spill_file_size BIGINT NOT NULL, ADD COLUMN disk_read BIGINT NOT NULL, -ADD COLUMN disk_write BIGINT NOT NULL; \ No newline at end of file +ADD COLUMN disk_write BIGINT NOT NULL; +CREATE FUNCTION pg_query_state(pid integer + , verbose boolean = FALSE + , costs boolean = FALSE + , timing boolean = FALSE + , buffers boolean = FALSE + , 
triggers boolean = FALSE + , format text = 'text') + RETURNS TABLE (pid integer + , frame_number integer + , query_text text + , plan text + , leader_pid integer) +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION cbdb_mpp_query_state(gp_segment_pid[]) + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_pause() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_resume() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_pause_command() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_resume_command() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + diff --git a/contrib/perfmon/perfmon.control b/contrib/perfmon/perfmon.control index 0bd3cd84ee2..be92705020a 100644 --- a/contrib/perfmon/perfmon.control +++ b/contrib/perfmon/perfmon.control @@ -1,5 +1,5 @@ comment = 'data type for storing sets of (key, value) pairs' default_version = '1.1.0' -module_pathname = '$libdir/perfmon' +module_pathname = '$libdir/gpmon' relocatable = true trusted = true diff --git a/contrib/perfmon/perfmon.sql b/contrib/perfmon/perfmon.sql index ccfd80c21a5..fb457d05074 100644 --- a/contrib/perfmon/perfmon.sql +++ b/contrib/perfmon/perfmon.sql @@ -1,4 +1,7 @@ --- Only can be installed in gpperfmon databse +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION perfmon" to load this file. 
\quit + +-- Only can be installed in gpperfmon database CREATE OR REPLACE FUNCTION checkdbname() RETURNS void AS $$ @@ -367,3 +370,47 @@ CREATE TABLE worksheet_versions ( uuid VARCHAR(1000) DEFAULT '-1' ); RESET search_path; + +CREATE FUNCTION pg_query_state(pid integer + , verbose boolean = FALSE + , costs boolean = FALSE + , timing boolean = FALSE + , buffers boolean = FALSE + , triggers boolean = FALSE + , format text = 'text') + RETURNS TABLE (pid integer + , frame_number integer + , query_text text + , plan text + , leader_pid integer) + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE TYPE gp_segment_pid AS ( + "segid" int4, + "TABLE_SIZE" int4 +); + +CREATE FUNCTION cbdb_mpp_query_state(gp_segment_pid[]) + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_pause() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_resume() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; +CREATE FUNCTION query_state_pause_command() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION query_state_resume_command() + RETURNS void + AS 'MODULE_PATHNAME' + LANGUAGE C STRICT VOLATILE; \ No newline at end of file diff --git a/contrib/perfmon/src/gpmon/Makefile b/contrib/perfmon/src/gpmon/Makefile index 777183f07de..f41f18fd9f6 100644 --- a/contrib/perfmon/src/gpmon/Makefile +++ b/contrib/perfmon/src/gpmon/Makefile @@ -1,8 +1,8 @@ top_builddir = ../../../../ MODULE_big = gpmon -OBJS = gpmon.o -PG_CPPFLAGS = -I../include +OBJS = gpmon.o pg_query_state.o signal_handler.o +PG_CPPFLAGS = -I../include -I$(libpq_srcdir) ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 244bd172c98..e4f677162cb 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -19,13 +19,15 @@ #include "access/xact.h" #include "cdb/cdbtm.h" #include 
"cdb/cdbvars.h" +#include "commands/explain.h" #include "executor/executor.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/metrics_utils.h" #include "utils/metrics_utils.h" #include "utils/snapmgr.h" -#include "commands/explain.h" + +#include "pg_query_state.h" PG_MODULE_MAGIC; static int32 init_tmid = -1;; @@ -380,7 +382,7 @@ void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket) /** * Call this method when query finishes executing. */ -void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket) +void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, bool updateRecord) { struct timeval tv; @@ -391,12 +393,12 @@ void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket) gpmonPacket->u.qlog.tsubmit = tsubmit; gpmonPacket->u.qlog.tstart = tstart; gpmonPacket->u.qlog.tfin = tv.tv_sec; - - gpmon_record_update(gpmonPacket->u.qlog.key.tmid, - gpmonPacket->u.qlog.key.ssid, - gpmonPacket->u.qlog.key.ccnt, - gpmonPacket->u.qlog.status); - + if (updateRecord) + gpmon_record_update(gpmonPacket->u.qlog.key.tmid, + gpmonPacket->u.qlog.key.ssid, + gpmonPacket->u.qlog.key.ccnt, + gpmonPacket->u.qlog.status); + gpmon_send(gpmonPacket); } @@ -450,6 +452,7 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) char *query_text; char *plan; QueryDesc *qd = (QueryDesc *)queryDesc; + bool updateRecord = true; if (perfmon_enabled && qd != NULL) { gpmon_packet_t *gpmonPacket = NULL; @@ -475,7 +478,21 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) gpmon_qlog_query_submit(gpmonPacket); break; case METRICS_QUERY_DONE: - gpmon_qlog_query_end(gpmonPacket); + if (enable_qs_runtime() && CachedQueryStateInfo != NULL && + CachedQueryStateInfo->gp_command_count == gp_command_count) + { + query_text = get_query_text(qd); + plan = (char *)CachedQueryStateInfo->data; + gpmon_qlog_query_text(gpmonPacket, + query_text, + plan, + application_name, + NULL, + NULL, + GPMON_QLOG_STATUS_DONE); + updateRecord = false; + } + 
gpmon_qlog_query_end(gpmonPacket, updateRecord); break; /* TODO: no GPMON_QLOG_STATUS for METRICS_QUERY_CANCELED */ case METRICS_QUERY_CANCELING: @@ -486,16 +503,19 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) gpmon_qlog_query_error(gpmonPacket); break; case METRICS_PLAN_NODE_INITIALIZE: - query_text = get_query_text(qd); - plan = get_plan(qd); - gpmon_qlog_query_text(gpmonPacket, - query_text, - plan, - application_name, - NULL, - NULL, - GPMON_QLOG_STATUS_START); - pfree(plan); + if (!enable_qs_runtime()) + { + query_text = get_query_text(qd); + plan = get_plan(qd); + gpmon_qlog_query_text(gpmonPacket, + query_text, + plan, + application_name, + NULL, + NULL, + GPMON_QLOG_STATUS_START); + pfree(plan); + } break; default: break; @@ -563,6 +583,7 @@ _PG_init(void) } init_tmid = t; gpmon_init(); + init_pg_query_state(); } void @@ -583,8 +604,8 @@ char* get_plan(QueryDesc *queryDesc) es->summary = es->analyze; es->format = EXPLAIN_FORMAT_JSON; es->settings = true; - ExplainBeginOutput(es); + ExplainQueryText(es, queryDesc); ExplainPrintPlan(es, queryDesc); ExplainEndOutput(es); diff --git a/contrib/perfmon/src/gpmon/pg_query_state.c b/contrib/perfmon/src/gpmon/pg_query_state.c new file mode 100644 index 00000000000..d75c5c05fa3 --- /dev/null +++ b/contrib/perfmon/src/gpmon/pg_query_state.c @@ -0,0 +1,1738 @@ +/* + * pg_query_state.c + * Extract information about query state from other backend + * + * Copyright (c) 2016-2024, Postgres Professional + * + * contrib/pg_query_state/pg_query_state.c + * IDENTIFICATION + */ +#include "pg_query_state.h" + +#include "access/htup_details.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "executor/execParallel.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "nodes/print.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "storage/ipc.h" +#include "storage/s_lock.h" +#include "storage/spin.h" +#include 
"storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/shm_toc.h" +#include "utils/guc.h" +#include "utils/timestamp.h" +#include "cdb/cdbdispatchresult.h" +#include "cdb/cdbdisp_query.h" +#include "cdb/cdbexplain.h" +#include "cdb/cdbvars.h" +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "fmgr.h" +#include "utils/lsyscache.h" +#include "utils/typcache.h" +#include "libpq-int.h" + +#define TEXT_CSTR_CMP(text, cstr) \ + (memcmp(VARDATA(text), (cstr), VARSIZE(text) - VARHDRSZ)) +#define HEADER_LEN sizeof(int) * 2 + +/* GUC variables */ +bool pg_qs_enable = true; +bool pg_qs_timing = false; +bool pg_qs_buffers = false; +StringInfo queryStateData = NULL; +volatile pg_atomic_uint32 *pg_qs_on; +/* + * CachedQueryStateInfo both exists on QE and QD + * + * On QE, it is used by pg_query_state. When one + * query node is finished, the query maybe still + * running. So then pg_query_state is called to + * fetch the whole query running state, use it for + * the finished query node. We cached the query + * state info at end the query. And reset it + * when next query starts. + * + * On QD, it is used to cache the whole query + * state info. And gpmon_query_info_collect_hook + * will send it to gpsmon. Also reset it when + * next query starts. 
+ */ +query_state_info *CachedQueryStateInfo = NULL; +MemoryContext queryStateCtx = NULL; + +/* Saved hook values in case of unload */ +static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static ExecutorRun_hook_type prev_ExecutorRun = NULL; +static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; +/* hooks defined in this module */ +static void qs_ExecutorStart(QueryDesc *queryDesc, int eflags); +static void qs_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, + uint64 count, bool execute_once); +static void qs_ExecutorFinish(QueryDesc *queryDesc); +static void qs_ExecutorEnd(QueryDesc *queryDesc); +static void clear_queryStateInfo(void); +static void +set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, int gp_command_count, int queryId); +static shm_mq_result receive_msg_by_parts(shm_mq_handle *mqh, Size *total, + void **datap, int64 timeout, int *rc, bool nowait); +/* functions added by cbdb */ +static List *GetRemoteBackendInfo(PGPROC *proc); +static CdbPgResults CollectQEQueryState(List *backendInfo); +static List *get_query_backend_info(ArrayType *array); +static shm_mq_msg *GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, + PGPROC *proc, + bool verbose, + bool costs, + bool timing, + bool buffers, + bool triggers, + ExplainFormat format); +static void qs_print_plan(qs_query *query); +static bool filter_query_common(QueryDesc *queryDesc); + /* functions added by cbdb */ + +/* important to record the info of the peer */ +static void check_and_init_peer(LOCKTAG *tag, PGPROC *proc, int n_peers); +static shm_mq_msg *receive_final_query_state(void); +static bool wait_for_mq_ready(shm_mq *mq); +static List *get_cdbStateCells(CdbPgResults cdb_pgresults); +static qs_query *push_query(QueryDesc *queryDesc); +static void pop_query(void); + +/* Global variables */ +List *QueryDescStack = NIL; +static ProcSignalReason 
UserIdPollReason = INVALID_PROCSIGNAL; +static ProcSignalReason QueryStatePollReason = INVALID_PROCSIGNAL; +static ProcSignalReason BackendInfoPollReason = INVALID_PROCSIGNAL; +static bool module_initialized = false; +static const char *be_state_str[] = { /* BackendState -> string repr */ + "undefined", /* STATE_UNDEFINED */ + "idle", /* STATE_IDLE */ + "active", /* STATE_RUNNING */ + "idle in transaction", /* STATE_IDLEINTRANSACTION */ + "fastpath function call", /* STATE_FASTPATH */ + "idle in transaction (aborted)", /* STATE_IDLEINTRANSACTION_ABORTED */ + "disabled", /* STATE_DISABLED */ + }; +static int reqid = 0; + +typedef struct +{ + slock_t mutex; /* protect concurrent access to `userid` */ + Oid userid; + Latch *caller; + pg_atomic_uint32 n_peers; +} RemoteUserIdResult; + +static void SendCurrentUserId(void); +//static void SendBgWorkerPids(void); +static Oid GetRemoteBackendUserId(PGPROC *proc); + + +/* Shared memory variables */ +shm_toc *toc = NULL; +RemoteUserIdResult *counterpart_userid = NULL; +pg_qs_params *params = NULL; +shm_mq *mq = NULL; + +/* Running on QE to collect query state from slices */ +PG_FUNCTION_INFO_V1(pg_query_state); +PG_FUNCTION_INFO_V1(cbdb_mpp_query_state); +PG_FUNCTION_INFO_V1(query_state_pause); +PG_FUNCTION_INFO_V1(query_state_resume); +PG_FUNCTION_INFO_V1(query_state_pause_command); +PG_FUNCTION_INFO_V1(query_state_resume_command); +/* + * Estimate amount of shared memory needed. + */ +static Size +pg_qs_shmem_size() +{ + shm_toc_estimator e; + Size size; + int nkeys; + + shm_toc_initialize_estimator(&e); + + nkeys = 3; + + shm_toc_estimate_chunk(&e, sizeof(RemoteUserIdResult)); + shm_toc_estimate_chunk(&e, sizeof(pg_qs_params)); + shm_toc_estimate_chunk(&e, (Size) QUEUE_SIZE); + + shm_toc_estimate_keys(&e, nkeys); + size = shm_toc_estimate(&e); + + + size = MAXALIGN(size) + MAXALIGN(sizeof(pg_atomic_uint32)); + return size; +} + +/* + * Distribute shared memory. 
+ */ +static void +pg_qs_shmem_startup(void) +{ + bool found; + Size shmem_size = pg_qs_shmem_size() - MAXALIGN(sizeof(pg_atomic_uint32)); + void *shmem; + int num_toc = 0; + + shmem = ShmemInitStruct("pg_query_state", shmem_size, &found); + if (!found) + { + toc = shm_toc_create(PG_QS_MODULE_KEY, shmem, shmem_size); + + counterpart_userid = shm_toc_allocate(toc, sizeof(RemoteUserIdResult)); + shm_toc_insert(toc, num_toc++, counterpart_userid); + SpinLockInit(&counterpart_userid->mutex); + pg_atomic_init_u32(&counterpart_userid->n_peers, 0); + + params = shm_toc_allocate(toc, sizeof(pg_qs_params)); + shm_toc_insert(toc, num_toc++, params); + + mq = shm_toc_allocate(toc, QUEUE_SIZE); + shm_toc_insert(toc, num_toc++, mq); + } + else + { + toc = shm_toc_attach(PG_QS_MODULE_KEY, shmem); + +#if PG_VERSION_NUM < 100000 + counterpart_userid = shm_toc_lookup(toc, num_toc++); + params = shm_toc_lookup(toc, num_toc++); + mq = shm_toc_lookup(toc, num_toc++); +#else + counterpart_userid = shm_toc_lookup(toc, num_toc++, false); + params = shm_toc_lookup(toc, num_toc++, false); + mq = shm_toc_lookup(toc, num_toc++, false); +#endif + } + pg_qs_on = (pg_atomic_uint32 *) ShmemInitStruct("pg_qs_on", sizeof(pg_atomic_uint32), &found); + if (!found) + pg_atomic_init_u32(pg_qs_on, 1); + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + module_initialized = true; +} + +#if PG_VERSION_NUM >= 150000 +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void pg_qs_shmem_request(void); +#endif + +/* + * Module load callback + */ +void +init_pg_query_state(void) +{ +#if PG_VERSION_NUM >= 150000 + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = pg_qs_shmem_request; +#else + RequestAddinShmemSpace(pg_qs_shmem_size()); +#endif + + /* Register interrupt on custom signal of polling query state */ + UserIdPollReason = RegisterCustomProcSignalHandler(SendCurrentUserId); + QueryStatePollReason = RegisterCustomProcSignalHandler(SendQueryState); + 
BackendInfoPollReason = RegisterCustomProcSignalHandler(SendCdbComponents); + if (QueryStatePollReason == INVALID_PROCSIGNAL + || UserIdPollReason == INVALID_PROCSIGNAL + || BackendInfoPollReason == INVALID_PROCSIGNAL) + { + ereport(WARNING, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("pg_query_state isn't loaded: insufficient custom ProcSignal slots"))); + return; + } + + /* Define custom GUC variables */ + DefineCustomBoolVariable("pg_query_state.enable", + "Enable module.", + NULL, + &pg_qs_enable, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomBoolVariable("pg_query_state.enable_timing", + "Collect timing data, not just row counts.", + NULL, + &pg_qs_timing, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + DefineCustomBoolVariable("pg_query_state.enable_buffers", + "Collect buffer usage.", + NULL, + &pg_qs_buffers, + false, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + EmitWarningsOnPlaceholders("pg_query_state"); + + /* Install hooks */ + if (Gp_role == GP_ROLE_DISPATCH) + { + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = qs_ExecutorStart; + } + prev_ExecutorRun = ExecutorRun_hook; + ExecutorRun_hook = qs_ExecutorRun; + prev_ExecutorFinish = ExecutorFinish_hook; + ExecutorFinish_hook = qs_ExecutorFinish; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pg_qs_shmem_startup; + + prev_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = qs_ExecutorEnd; +} + +#if PG_VERSION_NUM >= 150000 +static void +pg_qs_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(pg_qs_shmem_size()); +} +#endif + +/* + * ExecutorStart hook: + * set up flags to store runtime statistics, + * push current query description in global stack + */ +static void +qs_ExecutorStart(QueryDesc *queryDesc, int eflags) +{ + instr_time starttime; + /* Enable per-node instrumentation */ + if (enable_qs_runtime() && ((eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0) && + Gp_role == 
GP_ROLE_DISPATCH && is_querystack_empty() && + filter_query_common(queryDesc)) + { + queryDesc->instrument_options |= INSTRUMENT_CDB; + queryDesc->instrument_options |= INSTRUMENT_ROWS; + if (pg_qs_timing) + queryDesc->instrument_options |= INSTRUMENT_TIMER; + if (pg_qs_buffers) + queryDesc->instrument_options |= INSTRUMENT_BUFFERS; + + INSTR_TIME_SET_CURRENT(starttime); + queryDesc->showstatctx = cdbexplain_showExecStatsBegin(queryDesc, + starttime); + } + + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + if (enable_qs_runtime() && ((eflags & EXEC_FLAG_EXPLAIN_ONLY)) == 0 && + queryDesc->totaltime == NULL && Gp_role == GP_ROLE_DISPATCH + && is_querystack_empty()) + { + MemoryContext oldcxt; + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); + MemoryContextSwitchTo(oldcxt); + } +} + +/* + * ExecutorRun: + * Catch any fatal signals + */ +static void +qs_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count, + bool execute_once) +{ + /* Clear query state info if we are in a new query */ + if (is_querystack_empty() && CachedQueryStateInfo != NULL) + { + clear_queryStateInfo(); + } + push_query(queryDesc); + + PG_TRY(); + { + if (prev_ExecutorRun) +#if PG_VERSION_NUM < 100000 + prev_ExecutorRun(queryDesc, direction, count); + else + standard_ExecutorRun(queryDesc, direction, count); +#else + prev_ExecutorRun(queryDesc, direction, count, execute_once); + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); +#endif + pop_query(); + } + PG_CATCH(); + { + pop_query(); + PG_RE_THROW(); + } + PG_END_TRY(); +} + +/* + * ExecutorFinish: + * Catch any fatal signals + */ +static void +qs_ExecutorFinish(QueryDesc *queryDesc) +{ + push_query(queryDesc); + PG_TRY(); + { + if (prev_ExecutorFinish) + prev_ExecutorFinish(queryDesc); + else + standard_ExecutorFinish(queryDesc); + pop_query(); + } + 
PG_CATCH(); + { + pop_query(); + PG_RE_THROW(); + } + PG_END_TRY(); +} + +/* + * Find PgBackendStatus entry + */ +static PgBackendStatus * +search_be_status(int pid) +{ + int beid; + + if (pid <= 0) + return NULL; + + for (beid = 1; beid <= pgstat_fetch_stat_numbackends(); beid++) + { +#if PG_VERSION_NUM >= 160000 + PgBackendStatus *be_status = pgstat_get_beentry_by_backend_id(beid); +#else + PgBackendStatus *be_status = pgstat_fetch_stat_beentry(beid); +#endif + + if (be_status && be_status->st_procpid == pid) + return be_status; + } + + return NULL; +} + + +void +UnlockShmem(LOCKTAG *tag) +{ + LockRelease(tag, ExclusiveLock, false); +} + +void +LockShmem(LOCKTAG *tag, uint32 key) +{ + LockAcquireResult result; + tag->locktag_field1 = PG_QS_MODULE_KEY; + tag->locktag_field2 = key; + tag->locktag_field3 = 0; + tag->locktag_field4 = 0; + tag->locktag_type = LOCKTAG_USERLOCK; + tag->locktag_lockmethodid = USER_LOCKMETHOD; + result = LockAcquire(tag, ExclusiveLock, false, false); + Assert(result == LOCKACQUIRE_OK); + elog(DEBUG1, "LockAcquireResult is OK %d", result); +} + +/* + * Structure of stack frame of fucntion call which transfers through message queue + */ +typedef struct +{ + text *query; + text *plan; +} stack_frame; + +/* + * Convert serialized stack frame into stack_frame record + * Increment '*src' pointer to the next serialized stack frame + */ +static stack_frame * +deserialize_stack_frame(char **src) +{ + stack_frame *result = palloc(sizeof(stack_frame)); + text *query = (text *) *src, + *plan = (text *) (*src + INTALIGN(VARSIZE(query))); + + result->query = palloc(VARSIZE(query)); + memcpy(result->query, query, VARSIZE(query)); + result->plan = palloc(VARSIZE(plan)); + memcpy(result->plan, plan, VARSIZE(plan)); + + *src = (char *) plan + INTALIGN(VARSIZE(plan)); + return result; +} + +/* + * Convert serialized stack frames into List of stack_frame records + */ +static List * +deserialize_stack(char *src, int stack_depth) +{ + List *result = NIL; + 
char *curr_ptr = src; + int i; + + for (i = 0; i < stack_depth; i++) + { + stack_frame *frame = deserialize_stack_frame(&curr_ptr); + result = lappend(result, frame); + } + + return result; +} + +/* + * Implementation of pg_query_state function + */ +Datum +pg_query_state(PG_FUNCTION_ARGS) +{ + typedef struct + { + PGPROC *proc; + ListCell *frame_cursor; + int frame_index; + List *stack; + } proc_state; + + /* multicall context type */ + typedef struct + { + ListCell *proc_cursor; + List *procs; + } pg_qs_fctx; + + FuncCallContext *funcctx; + MemoryContext oldcontext; + pg_qs_fctx *fctx; +#define N_ATTRS 5 + pid_t pid = PG_GETARG_INT32(0); + if (!enable_qs_runtime()) + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("pg_query_state is not enabled or paused"))); + if (Gp_role != GP_ROLE_DISPATCH) + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("pg_query_state can only be called on coordinator"))); + + if (SRF_IS_FIRSTCALL()) + { + LOCKTAG tag; + bool verbose = PG_GETARG_BOOL(1), + costs = PG_GETARG_BOOL(2), + timing = PG_GETARG_BOOL(3), + buffers = PG_GETARG_BOOL(4), + triggers = PG_GETARG_BOOL(5); + text *format_text = PG_GETARG_TEXT_P(6); + ExplainFormat format; + PGPROC *proc; + shm_mq_msg *msg; + List *msgs; + List *backendInfo; + + if (!module_initialized) + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("pg_query_state wasn't initialized yet"))); + + if (pid == MyProcPid) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("attempt to extract state of current process"))); + + proc = BackendPidGetProc(pid); + if (!proc || proc->backendId == InvalidBackendId || proc->databaseId == InvalidOid || proc->roleId == InvalidOid) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("backend with pid=%d not found", pid))); + + if (TEXT_CSTR_CMP(format_text, "text") == 0) + format = EXPLAIN_FORMAT_TEXT; + else if (TEXT_CSTR_CMP(format_text, "xml") == 0) + format = EXPLAIN_FORMAT_XML; + 
else if (TEXT_CSTR_CMP(format_text, "json") == 0) + format = EXPLAIN_FORMAT_JSON; + else if (TEXT_CSTR_CMP(format_text, "yaml") == 0) + format = EXPLAIN_FORMAT_YAML; + else + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized 'format' argument"))); + /* + * init and acquire lock so that any other concurrent calls of this fuction + * can not occupy shared queue for transfering query state + */ + LockShmem(&tag, PG_QS_RCV_KEY); + check_and_init_peer(&tag, proc, 1); + + backendInfo = GetRemoteBackendInfo(proc); + CdbPgResults cdb_pgresults = CollectQEQueryState(backendInfo); + AttachPeer(); + msg = GetRemoteBackendQueryStates(cdb_pgresults, + proc, + verbose, + costs, + timing, + buffers, + triggers, + format); + + msgs = NIL; + if (msg != NULL) + { + msgs = lappend(msgs, msg ); + } + + funcctx = SRF_FIRSTCALL_INIT(); + if (msgs == NULL || list_length(msgs) == 0) + { + elog(DEBUG1, "backend does not reply"); + UnlockShmem(&tag); + SRF_RETURN_DONE(funcctx); + } + + msg = (shm_mq_msg *) linitial(msgs); + switch (msg->result_code) + { + case QUERY_NOT_RUNNING: + { + PgBackendStatus *be_status = search_be_status(pid); + + if (be_status) + elog(INFO, "state of backend is %s", + be_state_str[be_status->st_state - STATE_UNDEFINED]); + else + elog(INFO, "backend is not running query"); + + UnlockShmem(&tag); + SRF_RETURN_DONE(funcctx); + } + case STAT_DISABLED: + elog(INFO, "query execution statistics disabled"); + UnlockShmem(&tag); + SRF_RETURN_DONE(funcctx); + case QS_RETURNED: + { + TupleDesc tupdesc; + ListCell *i; + int64 max_calls = 0; + + /* print warnings if exist */ + if (msg->warnings & TIMINIG_OFF_WARNING) + ereport(WARNING, (errcode(ERRCODE_WARNING), + errmsg("timing statistics disabled"))); + if (msg->warnings & BUFFERS_OFF_WARNING) + ereport(WARNING, (errcode(ERRCODE_WARNING), + errmsg("buffers statistics disabled"))); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* save stack of calls and current 
cursor in multicall context */ + fctx = (pg_qs_fctx *) palloc(sizeof(pg_qs_fctx)); + fctx->procs = NIL; + foreach(i, msgs) + { + List *qs_stack; + shm_mq_msg *current_msg = (shm_mq_msg *) lfirst(i); + proc_state *p_state = (proc_state *) palloc(sizeof(proc_state)); + + if (current_msg->result_code != QS_RETURNED) + continue; + + Assert(current_msg->result_code == QS_RETURNED); + + qs_stack = deserialize_stack(current_msg->stack, + current_msg->stack_depth); + + p_state->proc = current_msg->proc; + p_state->stack = qs_stack; + p_state->frame_index = 0; + p_state->frame_cursor = list_head(qs_stack); + + fctx->procs = lappend(fctx->procs, p_state); + + max_calls += list_length(qs_stack); + } + fctx->proc_cursor = list_head(fctx->procs); + + funcctx->user_fctx = fctx; + funcctx->max_calls = max_calls; + + /* Make tuple descriptor */ +#if PG_VERSION_NUM < 120000 + tupdesc = CreateTemplateTupleDesc(N_ATTRS, false); +#else + tupdesc = CreateTemplateTupleDesc(N_ATTRS); +#endif + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "pid", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "frame_number", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "query_text", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "plan", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "leader_pid", INT4OID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + UnlockShmem(&tag); + MemoryContextSwitchTo(oldcontext); + } + break; + } + } + + /* restore function multicall context */ + funcctx = SRF_PERCALL_SETUP(); + fctx = funcctx->user_fctx; + + if (funcctx->call_cntr < funcctx->max_calls) + { + HeapTuple tuple; + Datum values[N_ATTRS]; + bool nulls[N_ATTRS]; + proc_state *p_state = (proc_state *) lfirst(fctx->proc_cursor); + stack_frame *frame = (stack_frame *) lfirst(p_state->frame_cursor); + + /* Make and return next tuple to caller */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + values[0] = 
Int32GetDatum(p_state->proc->pid); + values[1] = Int32GetDatum(p_state->frame_index); + values[2] = PointerGetDatum(frame->query); + values[3] = PointerGetDatum(frame->plan); + if (p_state->proc->pid == pid) + nulls[4] = true; + else + values[4] = Int32GetDatum(pid); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + + /* increment cursor */ +#if PG_VERSION_NUM >= 130000 + p_state->frame_cursor = lnext(p_state->stack, p_state->frame_cursor); +#else + p_state->frame_cursor = lnext(p_state->frame_cursor); +#endif + p_state->frame_index++; + + if (p_state->frame_cursor == NULL) +#if PG_VERSION_NUM >= 130000 + fctx->proc_cursor = lnext(fctx->procs, fctx->proc_cursor); +#else + fctx->proc_cursor = lnext(fctx->proc_cursor); +#endif + + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + SRF_RETURN_DONE(funcctx); +} + +static void +SendCurrentUserId(void) +{ + SpinLockAcquire(&counterpart_userid->mutex); + counterpart_userid->userid = GetUserId(); + SpinLockRelease(&counterpart_userid->mutex); + + SetLatch(counterpart_userid->caller); +} + +/* + * Extract effective user id from backend on which `proc` points. + * + * Assume the `proc` points on valid backend and it's not current process. + * + * This fuction must be called after registration of `UserIdPollReason` and + * initialization `RemoteUserIdResult` object in shared memory. 
+ */ +static Oid +GetRemoteBackendUserId(PGPROC *proc) +{ + Oid result; + + Assert(proc && proc->backendId != InvalidBackendId); + Assert(UserIdPollReason != INVALID_PROCSIGNAL); + Assert(counterpart_userid); + + counterpart_userid->userid = InvalidOid; + counterpart_userid->caller = MyLatch; + pg_write_barrier(); + + SendProcSignal(proc->pid, UserIdPollReason, proc->backendId); + int count = 0; + int64 delay = 1000; + for (;;) + { + SpinLockAcquire(&counterpart_userid->mutex); + result = counterpart_userid->userid; + SpinLockRelease(&counterpart_userid->mutex); + + if (result != InvalidOid || count == 3) + break; + +#if PG_VERSION_NUM < 100000 + WaitLatch(MyLatch, WL_LATCH_SET, 0); +#elif PG_VERSION_NUM < 120000 + WaitLatch(MyLatch, WL_LATCH_SET, 0, PG_WAIT_EXTENSION); +#else + WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, delay, + PG_WAIT_EXTENSION); +#endif + CHECK_FOR_INTERRUPTS(); + ResetLatch(MyLatch); + count ++; + } + + return result; +} + +/* + * Receive a message from a shared message queue until timeout is exceeded. + * + * Parameter `*nbytes` is set to the message length and *data to point to the + * message payload. If timeout is exceeded SHM_MQ_WOULD_BLOCK is returned. 
+ */ +shm_mq_result +shm_mq_receive_with_timeout(shm_mq_handle *mqh, + Size *nbytesp, + void **datap, + int64 timeout) +{ + int rc = 0; + int64 delay = timeout; + instr_time start_time; + instr_time cur_time; + + INSTR_TIME_SET_CURRENT(start_time); + + for (;;) + { + shm_mq_result mq_receive_result; + + mq_receive_result = receive_msg_by_parts(mqh, nbytesp, datap, timeout, &rc, true); + if (mq_receive_result != SHM_MQ_WOULD_BLOCK) + return mq_receive_result; + if (rc & WL_TIMEOUT || delay <= 0) + return SHM_MQ_WOULD_BLOCK; + +#if PG_VERSION_NUM < 100000 + rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT, delay); +#elif PG_VERSION_NUM < 120000 + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT, + delay, PG_WAIT_EXTENSION); +#else + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + delay, PG_WAIT_EXTENSION); +#endif + + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + + delay = timeout - (int64) INSTR_TIME_GET_MILLISEC(cur_time); + if (delay <= 0) + return SHM_MQ_WOULD_BLOCK; + + CHECK_FOR_INTERRUPTS(); + ResetLatch(MyLatch); + } +} + +static shm_mq_result +receive_msg_by_parts(shm_mq_handle *mqh, Size *total, void **datap, + int64 timeout, int *rc, bool nowait) +{ + shm_mq_result mq_receive_result; + shm_mq_msg *buff; + int offset; + Size *expected; + Size expected_data; + Size len; + + /* Get the expected number of bytes in message */ + mq_receive_result = shm_mq_receive(mqh, &len, (void **) &expected, nowait); + if (mq_receive_result != SHM_MQ_SUCCESS) + return mq_receive_result; + Assert(len == sizeof(Size)); + + expected_data = *expected; + *datap = palloc0(expected_data); + elog(DEBUG1, "receive data len %zu " , expected_data); + + /* Get the message itself */ + for (offset = 0; offset < expected_data; ) + { + int64 delay = timeout; + /* Keep receiving new messages until we assemble the full message */ + for (;;) + { + mq_receive_result = shm_mq_receive(mqh, &len, ((void **) &buff), nowait); + if 
(mq_receive_result != SHM_MQ_SUCCESS) + { + if (nowait && mq_receive_result == SHM_MQ_WOULD_BLOCK) + { + /* + * We can't leave this function during reading parts with + * error code SHM_MQ_WOULD_BLOCK because can be be error + * at next call receive_msg_by_parts() with continuing + * reading non-readed parts. + * So we should wait whole MAX_RCV_TIMEOUT timeout and + * return error after that only. + */ + if (delay > 0) + { + pg_usleep(PART_RCV_DELAY * 1000); + delay -= PART_RCV_DELAY; + continue; + } + if (rc) + { /* Mark that the timeout has expired: */ + *rc |= WL_TIMEOUT; + } + } + return mq_receive_result; + } + break; + } + memcpy((char *) *datap + offset, buff, len); + offset += len; + } + + *total = offset; + + return mq_receive_result; +} + +void +AttachPeer(void) +{ + pg_atomic_add_fetch_u32(&counterpart_userid->n_peers, 1); +} + +void +DetachPeer(void) +{ + int n_peers = pg_atomic_fetch_sub_u32(&counterpart_userid->n_peers, 1); + if (n_peers <= 0) + ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("pg_query_state peer is not responding"))); +} + +/* + * Extracts all QE worker running by process `proc` + */ +static List * +GetRemoteBackendInfo(PGPROC *proc) +{ + int sig_result; + shm_mq_handle *mqh; + shm_mq_result mq_receive_result; + Size msg_len; + backend_info *msg; + int i; + List *result = NIL; + + Assert(proc && proc->backendId != InvalidBackendId); + Assert(BackendInfoPollReason!= INVALID_PROCSIGNAL); + create_shm_mq(proc, MyProc); + + sig_result = SendProcSignal(proc->pid, BackendInfoPollReason, proc->backendId); + if (sig_result == -1) + goto signal_error; + + mqh = shm_mq_attach(mq, NULL, NULL); + mq_receive_result = shm_mq_receive(mqh, &msg_len, (void **) &msg, false); + if (mq_receive_result != SHM_MQ_SUCCESS || msg == NULL || msg->reqid != reqid) + goto mq_error; + if (msg->result_code == STAT_DISABLED) + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("query execution statistics disabled"))); + if (msg->result_code == 
QUERY_NOT_RUNNING) + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("backend is not running query"))); + int expect_len = BASE_SIZEOF_GP_BACKEND_INFO + msg->number * sizeof(gp_segment_pid); + if (msg_len != expect_len) + goto mq_error; + + for (i = 0; i < msg->number; i++) + { + gp_segment_pid *segpid = &(msg->pids[i]); + elog(DEBUG1, "QE %d is running on segment %d", segpid->pid, segpid->segid); + result = lcons(segpid, result); + } + shm_mq_detach(mqh); + return result; + +signal_error: + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("invalid send signal"))); +mq_error: + shm_mq_detach(mqh); + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("backend is not running query"))); +} + +/* + * Dispatch sql SELECT cbdb_mpp_query_state(::gp_segment_pid[]) + * to collect QE query state + * + * return: CdbPgResults contains struct {sliceIndex, gp_command_count, CdbExplain_StatHdr} + * data from all QE workers + */ +static CdbPgResults +CollectQEQueryState(List *backendInfo) +{ + ListCell *lc; + int index = 0; + StringInfoData params; + char *sql; + CdbPgResults cdb_pgresults = {NULL, 0}; + + if (list_length(backendInfo) <= 0) + return cdb_pgresults; + /* generate the below sql + * SELECT cbdb_mpp_query_state((ARRAY['(0,1789)','(1,2984)'])::gp_segment_pid[]); + * (0,1789) segid | pid + * segment will check the segid and find those pids of + * local segment to collect query state + */ + initStringInfo(¶ms); + + foreach(lc, backendInfo) + { + index++; + gp_segment_pid *segpid= (gp_segment_pid *) lfirst(lc); + appendStringInfo(¶ms, "'(%d,%d)'", segpid->segid, segpid->pid ); + if (index != list_length(backendInfo)) + { + appendStringInfoChar(¶ms, ','); + } + } + + sql = psprintf("SELECT cbdb_mpp_query_state((ARRAY[%s])::gp_segment_pid[])", params.data); + CdbDispatchCommand(sql, DF_NONE, &cdb_pgresults); + elog(DEBUG1, "SQL FOR QUERY %s, result num is %d", sql, cdb_pgresults.numResults); + pfree(params.data); + pfree(sql); + return 
cdb_pgresults; +} + +/* + * Signal the QD which running the current query to generate + * the final explain result and send the cdb_pgresults to it. + */ +static shm_mq_msg* +GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, + PGPROC *proc, + bool verbose, + bool costs, + bool timing, + bool buffers, + bool triggers, + ExplainFormat format) +{ + int sig_result; + shm_mq_handle *mqh; + ListCell *lc = NULL; + List *pgCdbStatCells = get_cdbStateCells(cdb_pgresults); + int resnum = list_length(pgCdbStatCells); + + /* fill in parameters of query state request */ + params->verbose = verbose; + params->costs = costs; + params->timing = timing; + params->buffers = buffers; + params->triggers = triggers; + params->format = format; + pg_write_barrier(); + create_shm_mq(MyProc, proc); + elog(DEBUG1, "CREATE shm_mq sender %d, %d, sender %d", MyProc->pid, MyProcPid, proc->pid); + + mqh = shm_mq_attach(mq, NULL, NULL); + sig_result = SendProcSignal(proc->pid, + QueryStatePollReason, + proc->backendId); + if (sig_result == -1) + { + goto signal_error; + } + // write out how many cdb_pgresults.numResults + if (send_msg_by_parts(mqh, sizeof(int), (void*)(&resnum)) != MSG_BY_PARTS_SUCCEEDED) + { + elog(WARNING, "pg_query_state: peer seems to have detached"); + goto mq_error; + } + /* send the cdb_pgresults to shm_mq */ + foreach(lc, pgCdbStatCells) + { + pgCdbStatCell *statcell = (pgCdbStatCell *)lfirst(lc); + if (send_msg_by_parts(mqh, statcell->len, statcell->data) != MSG_BY_PARTS_SUCCEEDED) + { + elog(WARNING, "pg_query_state: peer seems to have detached"); + goto mq_error; + } + } + shm_mq_detach(mqh); + return receive_final_query_state(); + +signal_error: + shm_mq_detach(mqh); + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("invalid send signal"))); +mq_error: + shm_mq_detach(mqh); + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("error in message queue data transmitting"))); +} + +static shm_mq_msg* +receive_final_query_state(void) +{ + 
shm_mq_handle *mqh; + shm_mq_result mq_receive_result; + Size len; + shm_mq_msg *msg; + if (!wait_for_mq_ready(mq)) + return NULL; + mqh = shm_mq_attach(mq, NULL, NULL); + mq_receive_result = shm_mq_receive_with_timeout(mqh, + &len, + (void **)&msg, + MAX_RCV_TIMEOUT); + if (!check_msg(mq_receive_result, msg, len, params->reqid)) + goto mq_error; + shm_mq_detach(mqh); + return msg; +mq_error: + shm_mq_detach(mqh); + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("error in message queue data transmitting"))); +} +/* Running on QE to collect query state from slices */ +Datum +cbdb_mpp_query_state(PG_FUNCTION_ARGS) +{ + ListCell *iter; + LOCKTAG tag; + shm_mq_result mq_receive_result; + shm_mq_handle *mqh = NULL; + + /* get the {segid, pid} info of this query */ + List *alive_procs = get_query_backend_info(PG_GETARG_ARRAYTYPE_P(0)); + if (alive_procs == NIL || list_length(alive_procs) <= 0) + PG_RETURN_NULL(); + LockShmem(&tag, PG_QS_RCV_KEY); + check_and_init_peer(&tag, NULL, 0); + /* + * collect query instrument results from all active QE backends + */ + foreach (iter, alive_procs) + { + PGPROC *proc = (PGPROC *)lfirst(iter); + int sig_result; + query_state_info *msg; + Size len; + if (proc == NULL) + continue; + /* + * Wait for shm_mq detached as the mq will be reused here, + * we need to wait for the mqh->sender to detach first, + * then reset the mq, otherwise it will panic + */ + if (mqh != NULL) + { + if (!wait_for_mq_detached(mqh)) + goto mq_error; + } + AttachPeer(); + create_shm_mq(proc, MyProc); + mqh = shm_mq_attach(mq, NULL, NULL); + /* + * send signal `QueryStatePollReason` to all processes + */ + sig_result = SendProcSignal(proc->pid, + QueryStatePollReason, + proc->backendId); + if (sig_result == -1) + { + /* the gang of this slice may be closed */ + if (errno != ESRCH) + continue; + + elog(WARNING, "failed to send signal"); + goto signal_error; + } + mq_receive_result = shm_mq_receive_with_timeout(mqh, + &len, + (void **)&msg, + 
MAX_RCV_TIMEOUT); + if (!check_msg(mq_receive_result, (shm_mq_msg *) msg, len, params->reqid)) + { + elog(DEBUG1, "invalid msg from %d", proc->pid); + goto mq_error; + } + /* + * the query of this slice maybe closed or no query running on that backend + * such as create table as, some backends insert data to the table instead + * of running any plan nodes. + * "create table tt as select oid from pg_class;" + */ + if (msg->result_code == QUERY_NOT_RUNNING) + { + continue; + } + if (msg->result_code == STAT_DISABLED) + { + ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("query execution statistics disabled"))); + goto mq_error; + } + /* a little hack here, send the query_state_info as query stats to QD */ + StringInfoData buf; + pq_beginmessage(&buf, 'Y'); + appendBinaryStringInfo(&buf, (char *)msg, len); + pq_endmessage(&buf); + elog(DEBUG1, "segment %d, sliceIndex %d send query state successfully %ld ", GpIdentity.segindex, msg->sliceIndex, len + sizeof(int)); + } + UnlockShmem(&tag); + PG_RETURN_VOID(); +signal_error: + DetachPeer(); + shm_mq_detach(mqh); + UnlockShmem(&tag); + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("invalid send signal"))); +mq_error: + shm_mq_detach(mqh); + UnlockShmem(&tag); + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to receive query state"))); +} + +static List* +get_query_backend_info(ArrayType *array) +{ + int32 len = 0; /* the length of oid array */ + int16 typlen; + int nitems; + bool typbyval; + char typalign; + Oid element_type = ARR_ELEMTYPE(array); + Datum *data; + bool *nulls; + List *alive_procs = NIL; + + get_typlenbyvalalign(element_type, + &typlen, &typbyval, &typalign); + deconstruct_array(array, element_type, typlen, typbyval, + typalign, &data, &nulls, + &nitems); + + len = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + + for (int i = 0; i < len; i++) + { + HeapTupleHeader td = DatumGetHeapTupleHeader(data[i]); + TupleDesc tupDesc; + HeapTupleData tmptup; + int32 pid; 
+ int32 segid; + bool isnull; + PGPROC *proc ; + //FIXME: check isnull + tupDesc = lookup_rowtype_tupdesc_copy(HeapTupleHeaderGetTypeId(td), + HeapTupleHeaderGetTypMod(td)); + tmptup.t_len = HeapTupleHeaderGetDatumLength(td); + tmptup.t_data = td; + segid = heap_getattr(&tmptup, 1, tupDesc, &isnull); + pid = heap_getattr(&tmptup, 2, tupDesc, &isnull); + proc = BackendPidGetProc(pid); + if (proc == NULL) + continue; + if(segid != GpIdentity.segindex) + { + continue; + } + alive_procs = lappend(alive_procs, proc); + } + return alive_procs; + +} + +bool +check_msg(shm_mq_result mq_receive_result, shm_mq_msg *msg, Size len, int reqid) +{ + if (mq_receive_result != SHM_MQ_SUCCESS) + { + elog(DEBUG1, "receive the msg from the shm_mq failed: %d", mq_receive_result); + return false; + } + if (msg->reqid != reqid) + { + elog(WARNING, "check the msg reqid failed: msg reqid %d, reqid %d", msg->reqid, reqid); + return false; + } + Assert(len == msg->length); + return true; +} + +void +create_shm_mq(PGPROC *sender, PGPROC *receiver) +{ + memset(mq, 0, QUEUE_SIZE); + mq = shm_mq_create(mq, QUEUE_SIZE); + shm_mq_set_sender(mq, sender); + shm_mq_set_receiver(mq, receiver); /* this function notifies the + counterpart to come into data + transfer */ +} + +static bool +wait_for_mq_ready(shm_mq *mq) +{ + /* wait until caller sets this process as sender or receiver to message queue */ + instr_time start_time; + instr_time cur_time; + int64 delay = MAX_SND_TIMEOUT; + INSTR_TIME_SET_CURRENT(start_time); + for (;;) + { + if (shm_mq_get_receiver(mq) == MyProc) + break; + WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, delay, PG_WAIT_IPC); + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + + delay = MAX_SND_TIMEOUT - (int64)INSTR_TIME_GET_MILLISEC(cur_time); + if (delay <= 0) + { + elog(WARNING, "pg_query_state: failed to receive request from leader"); + return false; + } + CHECK_FOR_INTERRUPTS(); + ResetLatch(MyLatch); + } +return true; 
+} + +bool +wait_for_mq_detached(shm_mq_handle *mqh) +{ + /* wait until caller sets this process as sender or receiver to message queue */ + instr_time start_time; + instr_time cur_time; + int64 delay = MAX_SND_TIMEOUT; + INSTR_TIME_SET_CURRENT(start_time); + for (;;) + { + if (shm_mq_wait_for_attach(mqh) == SHM_MQ_DETACHED) + break; + WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, delay, PG_WAIT_IPC); + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + delay = MAX_SND_TIMEOUT - (int64)INSTR_TIME_GET_MILLISEC(cur_time); + if (delay <= 0) + { + elog(WARNING, "wait for mq detached timeout"); + return false; + } + CHECK_FOR_INTERRUPTS(); + } + return true; +} + +static void +check_and_init_peer(LOCKTAG *tag, PGPROC *proc, int n_peers) +{ + Oid counterpart_user_id; + instr_time start_time; + instr_time cur_time; + INSTR_TIME_SET_CURRENT(start_time); + while (pg_atomic_read_u32(&counterpart_userid->n_peers) != 0) + { + pg_usleep(1000000); /* wait one second */ + CHECK_FOR_INTERRUPTS(); + + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + + if (INSTR_TIME_GET_MILLISEC(cur_time) > MAX_RCV_TIMEOUT) + { + elog(DEBUG1, "pg_query_state: last request was interrupted"); + /* reset the n_peers in shared memory */ + pg_atomic_write_u32(&counterpart_userid->n_peers, 0); + break; + } + } + if (Gp_role == GP_ROLE_DISPATCH) + { + counterpart_user_id = GetRemoteBackendUserId(proc); + if (counterpart_user_id == InvalidOid) + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("query is busy, no response"))); + if (!(superuser() || GetUserId() == counterpart_user_id)) + { + UnlockShmem(tag); + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied"))); + } + } + pg_atomic_write_u32(&counterpart_userid->n_peers, n_peers); + params->reqid = ++reqid; + pg_write_barrier(); +} + +/* + * ExecutorEnd hook: log results if needed + */ +static void +qs_ExecutorEnd(QueryDesc 
*queryDesc) +{ + if (pg_qs_enable && is_querystack_empty() && filter_running_query(queryDesc)) + { + qs_query *query = push_query(queryDesc); + PG_TRY(); + { + if (Gp_role == GP_ROLE_EXECUTE && enable_qs_runtime() && + (query->queryDesc->instrument_options | INSTRUMENT_ROWS)) + { + StringInfo strInfo = cdbexplain_getExecStats_runtime(queryDesc); + if (strInfo != NULL) + set_CachedQueryStateInfo(LocallyExecutingSliceIndex(queryDesc->estate), strInfo, + gp_command_count, query->id); + } else { + qs_print_plan(query); + } + pop_query(); + } + PG_CATCH(); + { + pop_query(); + PG_RE_THROW(); + } + PG_END_TRY(); + } + if (prev_ExecutorEnd) + prev_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); +} +static void +qs_print_plan(qs_query *query) +{ + MemoryContext oldcxt; + QueryDesc *queryDesc = query->queryDesc; + double msec; + ErrorData *qeError = NULL; + if (Gp_role == GP_ROLE_DISPATCH && queryDesc->totaltime && queryDesc->showstatctx && enable_qs_runtime()) + { + if (queryDesc->estate->dispatcherState && + queryDesc->estate->dispatcherState->primaryResults) + { + EState *estate = queryDesc->estate; + DispatchWaitMode waitMode = DISPATCH_WAIT_NONE; + if (!estate->es_got_eos) + { + ExecSquelchNode(queryDesc->planstate, true); + } + + /* + * Wait for completion of all QEs. We send a "graceful" query + * finish, not cancel signal. Since the query has succeeded, + * don't confuse QEs by sending erroneous message. + */ + if (estate->cancelUnfinished) + waitMode = DISPATCH_WAIT_FINISH; + + cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE); + cdbdisp_getDispatchResults(queryDesc->estate->dispatcherState, &qeError); + } + if (!qeError) + { + /* + * Make sure we operate in the per-query context, so any cruft will be + * discarded later during ExecutorEnd. + */ + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); + + /* + * Make sure stats accumulation is done. 
(Note: it's okay if several + * levels of hook all do this.) + */ + InstrEndLoop(queryDesc->totaltime); + /* Log plan if duration is exceeded. */ + msec = queryDesc->totaltime->total; + if (msec >= 0) + { + ExplainState *es = NewExplainState(); + es->analyze = true; + es->verbose = false; + es->buffers = false; + es->wal = false; + es->timing = true; + es->summary = false; + es->format = EXPLAIN_FORMAT_JSON; + es->settings = true; + ExplainBeginOutput(es); + ExplainQueryText(es, queryDesc); + ExplainPrintPlan(es, queryDesc); + if (es->costs) + ExplainPrintJITSummary(es, queryDesc); + if (es->analyze) + ExplainPrintExecStatsEnd(es, queryDesc); + ExplainEndOutput(es); + + /* Remove last line break */ + if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n') + es->str->data[--es->str->len] = '\0'; + + es->str->data[0] = '{'; + es->str->data[es->str->len - 1] = '}'; + + /* save the qd query state, set the sliceId to be 0, it will be sent to gpsmon */ + set_CachedQueryStateInfo(0, es->str, gp_command_count, query->id); + } + MemoryContextSwitchTo(oldcxt); + } + } +} + +static void +clear_queryStateInfo(void) +{ + /* + * Don't process any signal when reseting the CachedQueryStateInfo + * so that will not leed to contention on this var + */ + HOLD_INTERRUPTS(); + if (CachedQueryStateInfo != NULL) + { + pfree(CachedQueryStateInfo); + CachedQueryStateInfo = NULL; + } + RESUME_INTERRUPTS(); +} + +static void +set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, int gp_command_count, int queryId) +{ + HOLD_INTERRUPTS(); + if (queryStateCtx == NULL) + { + queryStateCtx = AllocSetContextCreate(TopMemoryContext, + "save_query_state_cxt", + ALLOCSET_DEFAULT_SIZES); + } + if (CachedQueryStateInfo != NULL) + clear_queryStateInfo(); + MemoryContext queryContext = MemoryContextSwitchTo(queryStateCtx); + CachedQueryStateInfo = new_queryStateInfo(sliceIndex, strInfo,gp_command_count , queryId, QS_RETURNED); + MemoryContextSwitchTo(queryContext); + RESUME_INTERRUPTS(); 
+} +query_state_info* +new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, int queryId, PG_QS_RequestResult result_code) +{ + /* The strInfo->data[len] is \0, we need it to be included in the length */ + int dataLen = strInfo->len + 1; + /* + * Don't process any signal when setting the CachedQueryStateInfo + * so that will not leed to contention on this var + */ + query_state_info *info = (query_state_info *)palloc0(dataLen + sizeof(query_state_info)); + info->sliceIndex = sliceIndex; + info->gp_command_count = gp_command_count; + info->queryId = queryId; + info->length = strInfo->len + sizeof(query_state_info); + info->reqid = reqid; + info->proc = MyProc; + info->result_code = result_code; + memcpy(info->data, strInfo->data, dataLen); + pfree(strInfo); + return info; +} + +static bool +filter_query_common(QueryDesc *queryDesc) +{ + if (queryDesc == NULL) + return false; + if (queryDesc->extended_query) + return false; + return (queryDesc->operation == CMD_SELECT || queryDesc->operation == CMD_DELETE || + queryDesc->operation == CMD_INSERT || queryDesc->operation == CMD_UPDATE); +} +bool filter_running_query(QueryDesc *queryDesc) +{ + if (!filter_query_common(queryDesc)) + return false; + if (!queryDesc->instrument_options) + return false; + if (!queryDesc->instrument_options) + return false; + if ((queryDesc->instrument_options & INSTRUMENT_ROWS) == 0) + return false; + return true; +} + +bool +enable_qs_runtime(void) +{ + if (!pg_qs_enable) + return false; + return pg_atomic_read_u32(pg_qs_on); +} + +/* check and count the cbd_pgresults */ +static List * +get_cdbStateCells(CdbPgResults cdb_pgresults) +{ + List *pgCdbStatCells = NIL; + for (int i = 0; i < cdb_pgresults.numResults; i++) + { + PGresult *pgresult = cdb_pgresults.pg_results[i]; + + if (PQresultStatus(pgresult) != PGRES_TUPLES_OK) + { + cdbdisp_clearCdbPgResults(&cdb_pgresults); + elog(ERROR, "cdbRelMaxSegSize: resultStatus not tuples_Ok: %s %s", + PQresStatus(PQresultStatus(pgresult)), 
PQresultErrorMessage(pgresult)); + } + else + { + pgCdbStatCell *statcell; + /* Find our statistics in list of response messages. If none, skip. */ + /* FIXME: only one statecell per pgresult?*/ + for (statcell = pgresult->cdbstats; statcell; statcell = statcell->next) + { + if (!statcell) + { + /* should detach the mq as the cdb_pgresults.numResults sent to mq is not correct*/ + elog(WARNING, "invalid statecell"); + return NIL; + } + query_state_info *state = (query_state_info *)statcell->data; + if (!IsA((Node *)(state->data), CdbExplain_StatHdr)) + { + elog(WARNING, "not a statecell"); + continue; + } + pgCdbStatCells = lappend(pgCdbStatCells, statcell); + } + } + } + return pgCdbStatCells; +} + +Datum +query_state_pause(PG_FUNCTION_ARGS) +{ + if (!IS_QUERY_DISPATCHER()) + ereport(ERROR, (errcode(ERRCODE_GP_FEATURE_NOT_YET), errmsg("Only can be called on coordinator"))); + if (!superuser()) + { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to pause query state"))); + } + char *sql = psprintf("SELECT query_state_pause_command()"); + CdbDispatchCommand(sql, DF_NONE, NULL); + pfree(sql); + pg_atomic_write_u32(pg_qs_on, 0); + PG_RETURN_NULL(); +} + +Datum +query_state_resume(PG_FUNCTION_ARGS) +{ + if (!IS_QUERY_DISPATCHER()) + ereport(ERROR, (errcode(ERRCODE_GP_FEATURE_NOT_YET), errmsg("Only can be called on coordinator"))); + if (!superuser()) + { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to pause query state"))); + } + char *sql = psprintf("SELECT query_state_resume_command()"); + CdbDispatchCommand(sql, DF_NONE, NULL); + pfree(sql); + pg_atomic_write_u32(pg_qs_on, 1); + PG_RETURN_NULL(); +} +Datum +query_state_pause_command(PG_FUNCTION_ARGS) +{ + if (!superuser()) + { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to pause query state"))); + } + + pg_atomic_write_u32(pg_qs_on, 0); + PG_RETURN_NULL(); +} + + +Datum 
+query_state_resume_command(PG_FUNCTION_ARGS) +{ + if (!superuser()) + { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to pause query state"))); + } + pg_atomic_write_u32(pg_qs_on, 1); + PG_RETURN_NULL(); +} + +static qs_query* +push_query(QueryDesc *queryDesc) +{ + qs_query *query = (qs_query *) palloc0(sizeof(qs_query)); + query->id = list_length(QueryDescStack) + 1; + query->queryDesc = queryDesc; + QueryDescStack = lcons(query, QueryDescStack); + return query; +} + +static void +pop_query(void) +{ + QueryDescStack = list_delete_first(QueryDescStack); +} + +bool +is_querystack_empty(void) +{ + return list_length(QueryDescStack) == 0; +} + +qs_query* +get_query(void) +{ + return QueryDescStack == NIL ? NULL : (qs_query *)llast(QueryDescStack); +} diff --git a/contrib/perfmon/src/gpmon/pg_query_state.h b/contrib/perfmon/src/gpmon/pg_query_state.h new file mode 100644 index 00000000000..1e9bcfc2ee8 --- /dev/null +++ b/contrib/perfmon/src/gpmon/pg_query_state.h @@ -0,0 +1,173 @@ +/* + * pg_query_state.h + * Headers for pg_query_state extension. 
+ * + * Copyright (c) 2016-2024, Postgres Professional + * + * IDENTIFICATION + * contrib/pg_query_state/pg_query_state.h + */ +#ifndef __PG_QUERY_STATE_H__ +#define __PG_QUERY_STATE_H__ + +#include + +#include "commands/explain.h" +#include "nodes/pg_list.h" +#include "storage/procarray.h" +#include "storage/shm_mq.h" + + +#define QUEUE_SIZE (16 * 1024) +#define MSG_MAX_SIZE 1024 +#define WRITING_DELAY (100 * 1000) /* 100ms */ +#define NUM_OF_ATTEMPTS 6 + +#define TIMINIG_OFF_WARNING 1 +#define BUFFERS_OFF_WARNING 2 + +#define PG_QS_MODULE_KEY 0xCA94B108 +#define PG_QS_RCV_KEY 0 +#define PG_QS_SND_KEY 1 + +/* Receive timeout should be larger than send timeout to let workers stop waiting before polling process */ +#define MAX_RCV_TIMEOUT 6000 /* 6 seconds */ +#define MAX_SND_TIMEOUT 3000 /* 3 seconds */ + +/* + * Delay for receiving parts of full message (in case SHM_MQ_WOULD_BLOCK code), + * should be less than MAX_RCV_TIMEOUT + */ +#define PART_RCV_DELAY 1000 /* 1 second */ + +/* + * Result status on query state request from asked backend + */ +typedef enum +{ + QUERY_NOT_RUNNING, /* Backend doesn't execute any query */ + STAT_DISABLED, /* Collection of execution statistics is disabled */ + QS_RETURNED /* Backend successfully returned its query state */ +} PG_QS_RequestResult; + +/* + * Format of transmitted data through message queue + */ +typedef struct +{ + int reqid; + int length; /* size of message record, for sanity check */ + PGPROC *proc; + PG_QS_RequestResult result_code; + int warnings; + int stack_depth; + char stack[FLEXIBLE_ARRAY_MEMBER]; /* sequentially laid out stack frames in form of + text records */ +} shm_mq_msg; + +#define BASE_SIZEOF_SHM_MQ_MSG (offsetof(shm_mq_msg, stack_depth)) + +typedef struct +{ + int32 segid; + int32 pid; +} gp_segment_pid; + +/* + * Format of transmitted data gp_backend_info through message queue + */ +typedef struct +{ + int reqid; + int length; /* size of message record, for sanity check */ + PGPROC *proc; + 
PG_QS_RequestResult result_code; + int number; + gp_segment_pid pids[FLEXIBLE_ARRAY_MEMBER]; +} backend_info; + +typedef struct +{ + int reqid; + int length; /* size of message record, for sanity check */ + PGPROC *proc; + PG_QS_RequestResult result_code; + int sliceIndex; + int gp_command_count; + int queryId; + /* data saves the CdbExplain_StatHdr */ + char data[FLEXIBLE_ARRAY_MEMBER]; +} query_state_info; + + +#define BASE_SIZEOF_GP_BACKEND_INFO (offsetof(backend_info, pids)) +/* pg_query_state arguments */ +typedef struct +{ + int reqid; + bool verbose; + bool costs; + bool timing; + bool buffers; + bool triggers; + ExplainFormat format; +} pg_qs_params; + +typedef struct +{ + QueryDesc *queryDesc; + int id; +} qs_query; + + +/* moved from signal_handler.c*/ +/* + * An self-explanarory enum describing the send_msg_by_parts results + */ +typedef enum +{ + MSG_BY_PARTS_SUCCEEDED, + MSG_BY_PARTS_FAILED +} msg_by_parts_result; + + +/* pg_query_state */ +extern bool pg_qs_enable; +extern bool pg_qs_timing; +extern bool pg_qs_buffers; +extern List *QueryDescStack; +extern pg_qs_params *params; +extern shm_mq *mq; + +extern query_state_info *CachedQueryStateInfo; + +/* pg_query_setat.c */ +extern shm_mq_result +shm_mq_receive_with_timeout(shm_mq_handle *mqh, + Size *nbytesp, + void **datap, + int64 timeout); +extern bool enable_qs_runtime(void); +extern bool enable_qs_done(void); + + +/* signal_handler.c */ +extern void SendQueryState(void); +extern void SendCdbComponents(void); +extern void DetachPeer(void); +extern void AttachPeer(void); +extern void UnlockShmem(LOCKTAG *tag); +extern void LockShmem(LOCKTAG *tag, uint32 key); +extern void init_pg_query_state(void); +extern msg_by_parts_result send_msg_by_parts(shm_mq_handle *mqh, Size nbytes, const void *data); + +extern bool check_msg(shm_mq_result mq_receive_result, shm_mq_msg *msg, Size len, int reqid); +extern void create_shm_mq(PGPROC *sender, PGPROC *receiver); +extern bool filter_running_query(QueryDesc 
*queryDesc); +extern query_state_info *new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, + int queryId, + PG_QS_RequestResult result_code); +extern bool wait_for_mq_detached(shm_mq_handle *mqh); +extern bool is_querystack_empty(void); +extern qs_query *get_query(void); +#endif diff --git a/contrib/perfmon/src/gpmon/signal_handler.c b/contrib/perfmon/src/gpmon/signal_handler.c new file mode 100644 index 00000000000..36a334878f3 --- /dev/null +++ b/contrib/perfmon/src/gpmon/signal_handler.c @@ -0,0 +1,778 @@ +/* + * signal_handler.c + * Collect current query state and send it to requestor in custom signal handler + * + * Copyright (c) 2016-2024, Postgres Professional + * + * IDENTIFICATION + * contrib/pg_query_state/signal_handler.c + */ + +#include "pg_query_state.h" +#include "libpq-fe.h" + +#include "cdb/cdbexplain.h" +#include "cdb/cdbutil.h" +#include "cdb/cdbvars.h" +#include "cdb/cdbconn.h" +#include "cdb/cdbdispatchresult.h" +#include "commands/explain.h" +#include "miscadmin.h" +#if PG_VERSION_NUM >= 100000 +#include "pgstat.h" +#endif +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "libpq-int.h" +#include "libpq/pqmq.h" +#include "storage/lock.h" +/* + * Structure of stack frame of fucntion call which resulted from analyze of query state + */ +typedef struct +{ + const char *query; + char *plan; +} stack_frame; + + +msg_by_parts_result send_msg_by_parts(shm_mq_handle *mqh, Size nbytes, const void *data); + +static bool QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc); +static bool QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc); +static CdbDispatchResults* makeDispatchResults(SliceTable *table); +static bool +query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg); +static bool +send_cdbComponents_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg); +static bool +receive_QE_query_state(shm_mq_handle *mqh, List **pgresults, int queryId); +static bool +process_qe_query_state(CdbDispatcherState 
**disp_state, List *pgresults); +static void +fill_segpid(CdbComponentDatabaseInfo *segInfo ,backend_info *msg, int *index); +/* + * Compute length of serialized stack frame + */ +static int +serialized_stack_frame_length(stack_frame *qs_frame) +{ + return INTALIGN(strlen(qs_frame->query) + VARHDRSZ) + + INTALIGN(strlen(qs_frame->plan) + VARHDRSZ); +} + +/* + * Compute overall length of serialized stack of function calls + */ +static int +serialized_stack_length(List *qs_stack) +{ + ListCell *i; + int result = 0; + + foreach(i, qs_stack) + { + stack_frame *qs_frame = (stack_frame *) lfirst(i); + + result += serialized_stack_frame_length(qs_frame); + } + + return result; +} + +/* + * Convert stack_frame record into serialized text format version + * Increment '*dest' pointer to the next serialized stack frame + */ +static void +serialize_stack_frame(char **dest, stack_frame *qs_frame) +{ + SET_VARSIZE(*dest, strlen(qs_frame->query) + VARHDRSZ); + memcpy(VARDATA(*dest), qs_frame->query, strlen(qs_frame->query)); + *dest += INTALIGN(VARSIZE(*dest)); + + SET_VARSIZE(*dest, strlen(qs_frame->plan) + VARHDRSZ); + memcpy(VARDATA(*dest), qs_frame->plan, strlen(qs_frame->plan)); + *dest += INTALIGN(VARSIZE(*dest)); +} + +/* + * Convert List of stack_frame records into serialized structures laid out sequentially + */ +static void +serialize_stack(char *dest, List *qs_stack) +{ + ListCell *i; + + foreach(i, qs_stack) + { + stack_frame *qs_frame = (stack_frame *) lfirst(i); + + serialize_stack_frame(&dest, qs_frame); + } +} + +static msg_by_parts_result +shm_mq_send_nonblocking(shm_mq_handle *mqh, Size nbytes, const void *data, Size attempts) +{ + int i; + shm_mq_result res; + + for(i = 0; i < attempts; i++) + { +#if PG_VERSION_NUM < 150000 + res = shm_mq_send(mqh, nbytes, data, true); +#else + res = shm_mq_send(mqh, nbytes, data, true, true); +#endif + + if(res == SHM_MQ_SUCCESS) + break; + else if (res == SHM_MQ_DETACHED) + return MSG_BY_PARTS_FAILED; + + /* 
SHM_MQ_WOULD_BLOCK - sleeping for some delay */ + pg_usleep(WRITING_DELAY); + } + + if(i == attempts) + return MSG_BY_PARTS_FAILED; + + return MSG_BY_PARTS_SUCCEEDED; +} + +/* + * send_msg_by_parts sends data through the queue as a bunch of messages + * of smaller size + */ +msg_by_parts_result +send_msg_by_parts(shm_mq_handle *mqh, Size nbytes, const void *data) +{ + int bytes_left; + int bytes_send; + int offset; + + /* Send the expected message length */ + if(shm_mq_send_nonblocking(mqh, sizeof(Size), &nbytes, NUM_OF_ATTEMPTS) == MSG_BY_PARTS_FAILED) + return MSG_BY_PARTS_FAILED; + + /* Send the message itself */ + for (offset = 0; offset < nbytes; offset += bytes_send) + { + bytes_left = nbytes - offset; + bytes_send = (bytes_left < MSG_MAX_SIZE) ? bytes_left : MSG_MAX_SIZE; + if(shm_mq_send_nonblocking(mqh, bytes_send, &(((unsigned char*)data)[offset]), NUM_OF_ATTEMPTS) + == MSG_BY_PARTS_FAILED) + return MSG_BY_PARTS_FAILED; + } + + return MSG_BY_PARTS_SUCCEEDED; +} + +/* + * Send state of current query to shared queue. 
+ * This function is called when fire custom signal QueryStatePollReason + */ +void +SendQueryState(void) +{ + shm_mq_handle *mqh; + int reqid = params->reqid; + MemoryContext oldctx; + bool success = true; + + MemoryContext query_state_ctx = AllocSetContextCreate(TopMemoryContext, + "pg_query_state", + ALLOCSET_DEFAULT_SIZES); + oldctx = MemoryContextSwitchTo(query_state_ctx); + elog(DEBUG1, "Worker %d receives pg_query_state request from %d", shm_mq_get_sender(mq)->pid, shm_mq_get_receiver(mq)->pid); + + PG_TRY(); + { + mqh = shm_mq_attach(mq, NULL, NULL); + + /* happy path */ + elog(DEBUG1, "happy path"); + if (Gp_role == GP_ROLE_DISPATCH) + { + if (reqid != params->reqid || shm_mq_get_receiver(mq) != MyProc) + { + success = false; + } + else if (!QD_SendQueryState(mqh, MyProc)) + success = false; + } + else + { + if (reqid != params->reqid || shm_mq_get_sender(mq) != MyProc) + { + success = false; + } + else if (!QE_SendQueryState(mqh, MyProc)) + success = false; + } + } + PG_CATCH(); + { + MemoryContextSwitchTo(oldctx); + elog(WARNING, "Failed to send query state"); + elog_dismiss(WARNING); + success = false; + } + PG_END_TRY(); + shm_mq_detach(mqh); + DetachPeer(); + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(query_state_ctx); + return; +} + +/* Added by cbdb +* SendCdbComponents sends the array of gp_segment_pid info +* of current session to shm_mq +* Only called on QD +*/ +void +SendCdbComponents(void) +{ + shm_mq_handle *mqh; + int reqid = params->reqid; + shm_mq_result result; + CdbComponentDatabases *cdbs; + shm_mq_msg *pre_check_msg; + MemoryContext oldctx; + bool success = true; + int index = 0; + MemoryContext query_state_ctx = AllocSetContextCreate(TopMemoryContext, + "pg_query_state", + ALLOCSET_DEFAULT_SIZES); + oldctx = MemoryContextSwitchTo(query_state_ctx); + pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); + PG_TRY(); + { + mqh = shm_mq_attach(mq, NULL, NULL); + if (!send_cdbComponents_pre_check(mqh, params->reqid, 
pre_check_msg)) + { + success = false; + shm_mq_send(mqh, pre_check_msg->length, pre_check_msg, false); + } + else + { + cdbs = cdbcomponent_getCdbComponents(); + /* compute the size of the msg + * as the struct gp_segment_pid only contains two int fields, + * so not calling INTALIGN here. + */ + int msglen = BASE_SIZEOF_GP_BACKEND_INFO + sizeof(gp_segment_pid) * cdbs->numActiveQEs; + backend_info *msg = (backend_info *)palloc0(msglen); + /* index for backend_info.pids array */ + msg->reqid = reqid; + /* FIXME: add another code for it */ + msg->result_code = QS_RETURNED; + /* Fill the QE pid */ + for (int i = 0; i < cdbs->total_segment_dbs; i++) + { + CdbComponentDatabaseInfo *segInfo = &cdbs->segment_db_info[i]; + fill_segpid(segInfo, msg, &index); + } + /* Fill the entryDB pid */ + for (int i = 0; i < cdbs->total_entry_dbs; i++) + { + CdbComponentDatabaseInfo *segInfo = &cdbs->entry_db_info[i]; + fill_segpid(segInfo, msg, &index); + } + Assert(index == cdbs->numActiveQEs); + msg->number = index; +#if PG_VERSION_NUM < 150000 + result = shm_mq_send(mqh, msglen, msg, false); +#else + result = shm_mq_send(mqh, msglen, msg, false, true); +#endif + /* Check for failure. 
*/ + if (result != SHM_MQ_SUCCESS){ + shm_mq_detach(mqh); + success = false; + } + } + } + PG_CATCH(); + { + MemoryContextSwitchTo(oldctx); + elog(WARNING, " SendCdbComponents failed"); + elog_dismiss(WARNING); + success = false; + shm_mq_detach(mqh); + } + PG_END_TRY(); + DetachPeer(); + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(query_state_ctx); + if (success) + elog(DEBUG1, "Worker %d sends response for SendCdbComponents to %d", shm_mq_get_sender(mq)->pid, shm_mq_get_receiver(mq)->pid); + return; +} + +static bool +QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) +{ + QueryDesc *queryDesc; + ExplainState *es; + List *result = NIL; + shm_mq_msg *msg; + CdbDispatcherState *disp_state = NULL; + instr_time starttime; + List *qs_stack = NIL; + LOCKTAG tag; + bool success = true; + PGPROC *sender; + List *pgresults = NIL; + shm_mq_msg *pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); + qs_query *query = get_query(); + int queryId = query == NULL? -1 : query->id; + + /* first receive the results, it may be empty, such as functions only run on master */ + if (!receive_QE_query_state(mqh, &pgresults, queryId)) + return false; + queryDesc = query == NULL? 
NULL: query->queryDesc; + if (!process_qe_query_state(&disp_state, pgresults)) + return false; + sender = shm_mq_get_sender(mq); + if (!wait_for_mq_detached(mqh)) + return false; + /* recreate shm_mq, switch the sender and receiver*/ + LockShmem(&tag, PG_QS_SND_KEY); + create_shm_mq(MyProc, sender); + elog(DEBUG1, "switch sender and receiver receiver %d, sender %d",sender->pid, MyProc->pid); + mqh = shm_mq_attach(mq, NULL, NULL); + if (!query_state_pre_check(mqh, params->reqid, pre_check_msg)) + { + int sendRes = send_msg_by_parts(mqh, pre_check_msg->length, pre_check_msg); + UnlockShmem(&tag); + pfree(pre_check_msg); + if (sendRes != MSG_BY_PARTS_SUCCEEDED) + { + elog(DEBUG1, "send cannot send query state proc %d failed", proc->pid); + return false; + } + return true; + } + Assert(queryDesc); + + /* + * Save the old dispatcher state of estate. + * If the analyze of the query is true, the old_disp_state is not null, + * we need to restore it. + */ + CdbDispatcherState *old_disp_state = queryDesc->estate->dispatcherState; + struct CdbExplain_ShowStatCtx *oldShowstatctx = queryDesc->showstatctx; + PG_TRY(); + { + /* initialize explain state with all config parameters */ + es = NewExplainState(); + es->analyze = true; + es->verbose = params->verbose; + es->costs = params->costs; + es->buffers = params->buffers && pg_qs_buffers; + es->timing = params->timing && pg_qs_timing; + es->summary = false; + es->format = params->format; + es->runtime = true; + INSTR_TIME_SET_CURRENT(starttime); + es->showstatctx = cdbexplain_showExecStatsBegin(queryDesc, + starttime); + /* push the DispatchState into queryDesc->estate */ + queryDesc->estate->dispatcherState = disp_state; + queryDesc->showstatctx = es->showstatctx; + + initStringInfo(es->str); + ExplainBeginOutput(es); + ExplainQueryText(es, queryDesc); + ExplainPrintPlan(es, queryDesc); + if (es->costs) + ExplainPrintJITSummary(es, queryDesc); + if (es->analyze) + ExplainPrintExecStatsEnd(es, queryDesc); + ExplainEndOutput(es); 
+ + /* reset the dispatcherState in estate*/ + queryDesc->estate->dispatcherState = old_disp_state; + queryDesc->showstatctx = oldShowstatctx; + + /* Remove last line break */ + if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n') + es->str->data[--es->str->len] = '\0'; + + /* Fix JSON to output an object */ + if (params->format == EXPLAIN_FORMAT_JSON) + { + es->str->data[0] = '{'; + es->str->data[es->str->len - 1] = '}'; + } + stack_frame *qs_frame = palloc0(sizeof(stack_frame)); + + qs_frame->plan = es->str->data; + qs_frame->query = queryDesc->sourceText; + + qs_stack = lcons(qs_frame, result); + success= true; + } + PG_CATCH(); + { + UnlockShmem(&tag); + elog(WARNING, " SendQueryState failed"); + /* reset the queryDesc->estate */ + queryDesc->estate->dispatcherState = old_disp_state; + queryDesc->showstatctx = oldShowstatctx; + elog_dismiss(WARNING); + success = false; + } + PG_END_TRY(); + if (!success) + return success; + + /* send result to pg_query_state process */ + int msglen = sizeof(shm_mq_msg) + serialized_stack_length(qs_stack); + msg = palloc(msglen); + + msg->reqid = params->reqid; + msg->length = msglen; + msg->proc = MyProc; + msg->result_code = QS_RETURNED; + + msg->warnings = 0; + if (params->timing && !pg_qs_timing) + msg->warnings |= TIMINIG_OFF_WARNING; + if (params->buffers && !pg_qs_buffers) + msg->warnings |= BUFFERS_OFF_WARNING; + + msg->stack_depth = list_length(qs_stack); + serialize_stack(msg->stack, qs_stack); + + if (send_msg_by_parts(mqh, msglen, msg) != MSG_BY_PARTS_SUCCEEDED) + { + elog(WARNING, "pg_query_state: peer seems to have detached"); + UnlockShmem(&tag); + return false; + } + elog(DEBUG1, "Worker %d sends response for pg_query_state to %d", shm_mq_get_sender(mq)->pid, shm_mq_get_receiver(mq)->pid); + UnlockShmem(&tag); + return true; +} + +/* + * Send plan with instrument to the shm_mq + * + * The data is format as below + * First message: + * Msg length | reqid | proc | result_code | warnings | stack_depth | 
sliceIndex + * Second message: + * pq_msgtype('Y') | CdbExplain_StatHdr + */ +static bool +QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) +{ + //QueryDesc *queryDesc; + qs_query *query; + int sliceIndex; + query_state_info *info; + shm_mq_msg *pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); + bool success = true; + /* cannot use the send_msg_by_parts here */ + if (!query_state_pre_check(mqh, params->reqid, pre_check_msg)) + { + int sendRes = send_msg_by_parts(mqh, pre_check_msg->length, pre_check_msg); + pfree(pre_check_msg); + if (sendRes != MSG_BY_PARTS_SUCCEEDED) + { + elog(WARNING, "send cannot send query state proc %d failed", proc->pid); + return false; + } + return true; + } + PG_TRY(); + { + + if (is_querystack_empty()) + { + if (CachedQueryStateInfo == NULL) + success = false; + else + { + + int dataLen = 0; + info = (query_state_info *)palloc0(CachedQueryStateInfo->length); + info->length = CachedQueryStateInfo->length; + dataLen = CachedQueryStateInfo->length - sizeof(query_state_info); + info->sliceIndex = CachedQueryStateInfo->sliceIndex; + info->gp_command_count = CachedQueryStateInfo->gp_command_count; + memcpy(info->data, CachedQueryStateInfo->data, dataLen); + info->reqid = params->reqid; + info->proc = MyProc; + info->result_code = QS_RETURNED; + } + } + else { + query = get_query(); + Assert(query && query->queryDesc); + StringInfo strInfo = cdbexplain_getExecStats_runtime(query->queryDesc); + if (strInfo == NULL) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("cannot get runtime stats"))); + sliceIndex = LocallyExecutingSliceIndex(query->queryDesc->estate); + info = new_queryStateInfo(sliceIndex, strInfo, params->reqid, query->id, QS_RETURNED); + } + msg_by_parts_result sendResult = send_msg_by_parts(mqh, info->length, info); + pfree(info); + if (sendResult != MSG_BY_PARTS_SUCCEEDED) + { + elog(DEBUG1, "pg_query_state: peer seems to have detached"); + success = false; + } + } + PG_CATCH(); + { + 
elog_dismiss(WARNING); + elog(WARNING, "failed to get QE query state"); + success = false; + } + PG_END_TRY(); + if (success) + elog(DEBUG1, "Segment: %u slice: %d send query state successfully", GpIdentity.segindex, sliceIndex); + return success; +} + +/* copied from cdbdisp_makeDispatchResults */ +static CdbDispatchResults* +makeDispatchResults(SliceTable *table) +{ + CdbDispatchResults *results; + int resultCapacity = 0; + int nbytes; + + for(int i = 0; i < table->numSlices; i++) + { + resultCapacity += table->slices[i].planNumSegments; + } + + nbytes = resultCapacity * sizeof(results->resultArray[0]); + + results = palloc0(sizeof(*results)); + results->resultArray = palloc0(nbytes); + results->resultCapacity = resultCapacity; + results->resultCount = 0; + results->iFirstError = -1; + results->errcode = 0; + results->cancelOnError = true; + + results->sliceMap = NULL; + results->sliceCapacity = table->numSlices; + if (resultCapacity > 0) + { + nbytes = resultCapacity * sizeof(results->sliceMap[0]); + results->sliceMap = palloc0(nbytes); + } + return results; +} + +static bool +send_cdbComponents_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) +{ + bool res = query_state_pre_check(mqh, reqid, msg); + if (!res) + return res; + /* This function can only be called on QD */ + if (Gp_role != GP_ROLE_DISPATCH) + { + res = false; + if (msg != NULL) + *msg = (shm_mq_msg){reqid, BASE_SIZEOF_SHM_MQ_MSG, MyProc, QUERY_NOT_RUNNING}; + } + return res; + +} + +static void +set_msg(shm_mq_msg *msg, int reqid, PG_QS_RequestResult res) +{ + if (msg != NULL) + *msg = (shm_mq_msg){reqid, BASE_SIZEOF_SHM_MQ_MSG, MyProc, res}; +} +static bool +query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) +{ + qs_query *query = NULL; + /* check if module is enabled */ + if (!enable_qs_runtime()) + { + set_msg(msg, reqid, STAT_DISABLED); + return false; + } + /* On QE, check if there is a cached querystate info */ + if (Gp_role == GP_ROLE_EXECUTE && CachedQueryStateInfo
!= NULL ) + { + return true; + } + + /* no query running on QD/QE */ + if (list_length(QueryDescStack) != 1) + { + set_msg(msg, reqid, QUERY_NOT_RUNNING); + return false; + } + query = get_query(); + Assert(query && query->queryDesc); + + + if (!filter_running_query(query->queryDesc)) + { + set_msg(msg, reqid, QUERY_NOT_RUNNING); + return false; + } + return true; +} + +struct slice_result +{ + int sliceIndex; + int gp_command_count; + int queryId; + PGresult *pgresult; +}; + +/* Receive and process query stats from QE + * + * Firstly get the num of results as numresults + * Then transform the num of CdbExplain_StatHdr into pgresult + * and construct the CdbDispatcherState which is needed by + * "ExplainPrintPlan" + * + * CdbExplain_StatHdr is saved in query_state_info.data + */ +static bool +receive_QE_query_state(shm_mq_handle *mqh, List **pgresults, int queryId) +{ + shm_mq_result mq_receive_result; + Size len; + query_state_info *seg_query_state_info; + int *numresults; + mq_receive_result = shm_mq_receive_with_timeout(mqh, + &len, + (void **)&numresults, + MAX_RCV_TIMEOUT); + if (mq_receive_result != SHM_MQ_SUCCESS) + { + /* counterpart is dead, not considering it */ + elog(WARNING, "receive QE query state results failed through shm_mq"); + return false; + } + if (*numresults <= 0) + { + return true; + } + for (int i = 0; i < *numresults; i++) + { + PGresult *pgresult = palloc(sizeof(PGresult)); + int seg_command_count; + pgCdbStatCell *statcell = (pgCdbStatCell*)palloc(sizeof(pgCdbStatCell)); + + mq_receive_result = shm_mq_receive_with_timeout(mqh, + &len, + (void **)&seg_query_state_info, + MAX_RCV_TIMEOUT); + if (mq_receive_result != SHM_MQ_SUCCESS) + { + elog(WARNING, "receive QE query state results failed through shm_mq"); + /* counterpart is dead, not considering it */ + return false; + } + /* + * Check if the query on segment is the same as the current query + * There is the case when the query stats are collected from the segment, + * QD has started to
run the next query. + */ + seg_command_count = seg_query_state_info->gp_command_count; + if (seg_command_count != gp_command_count || seg_query_state_info->queryId != queryId) + { + elog(DEBUG1, "receive QE query state results command id or queryId is not correct"); + continue; + } + /* transform CdbExplain_StatHdr to pgresult */ + statcell->data = seg_query_state_info->data; + statcell->len = len - sizeof(query_state_info); + statcell->next = NULL; + pgresult->cdbstats = statcell; + struct slice_result *res = palloc(sizeof(struct slice_result)); + res->sliceIndex = seg_query_state_info->sliceIndex; + res->pgresult = pgresult; + res->gp_command_count = seg_command_count; + res->queryId = seg_query_state_info->queryId; + *pgresults = lappend(*pgresults, res); + elog(DEBUG1, "receive QE query state %d successfully", res->sliceIndex); + } + return true; +} + +static bool +process_qe_query_state(CdbDispatcherState **disp_state, List *pgresults) +{ + QueryDesc *queryDesc; + EState *estate; + CdbDispatchResults *results; + *disp_state = NULL; + qs_query *query; + /* TODO: give a specific error code for it */ + if (list_length(QueryDescStack) != 1) + { + return false; + } + query = get_query(); + Assert(query && query->queryDesc); + queryDesc = query->queryDesc; + /* The query may have been finished */ + if (queryDesc == NULL || queryDesc->estate == NULL) + { + return true; + } + estate = queryDesc->estate; + /* first construct a CdbDispatchResults */ + results = makeDispatchResults(estate->es_sliceTable); + if (results->resultCapacity < list_length(pgresults)) + { + /* + explain analyze select test_auto_stats_in_function('delete from t_test_auto_stats_in_function', + true, 't_test_auto_stats_in_function')*/ + return true; + } + /* the pgresult of the same slice should be put in continuous memory */ + for(int i = 0 ; i < estate->es_sliceTable->numSlices; i++) + { + ListCell *c; + foreach(c, pgresults) + { + struct slice_result *res = (struct slice_result *)lfirst(c); +
if(res->sliceIndex == i) + { + CdbDispatchResult *dispatchResult = cdbdisp_makeResult(results, NULL, res->sliceIndex); + cdbdisp_appendResult(dispatchResult, res->pgresult); + } + } + } + *disp_state = MemoryContextAllocZero(CurrentMemoryContext, sizeof(CdbDispatcherState)); + (*disp_state)->primaryResults = results; + return true; +} + +static void +fill_segpid(CdbComponentDatabaseInfo *segInfo , backend_info *msg, int* index) +{ + + ListCell *lc; + foreach (lc, segInfo->activelist) + { + gp_segment_pid *segpid = &msg->pids[(*index)++]; + SegmentDatabaseDescriptor *dbdesc = (SegmentDatabaseDescriptor *)lfirst(lc); + segpid->pid = dbdesc->backendPid; + segpid->segid = dbdesc->segindex; + } +} diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index 9519b32f435..211656afbfd 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -860,7 +860,6 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) ppkt->u.qlog.p_metrics.cpu_pct = pidrec->p_metrics.cpu_pct; ppkt->u.qlog.p_metrics.fd_cnt = pidrec->p_metrics.fd_cnt; ppkt->u.qlog.p_metrics.mem = pidrec->p_metrics.mem; - ppkt->u.qlog.pid = pidrec->pid; TR2(("%s: SEND %d-%d-%d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", FLINE, ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt, diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index 4fe9fb87df1..beae07218cf 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -54,7 +54,7 @@ extern void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, const char *resqPriority, int status); extern void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket); -extern void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket); +extern void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, bool updateRecord); extern void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket); extern void 
gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket); extern void gpmon_send(gpmon_packet_t*); diff --git a/contrib/vectorization/src/backend/hook/explain.c b/contrib/vectorization/src/backend/hook/explain.c new file mode 100644 index 00000000000..c79bb3ce869 --- /dev/null +++ b/contrib/vectorization/src/backend/hook/explain.c @@ -0,0 +1,6049 @@ +/* + * FIXME: This file will be deleted in the future + */ +#include "hook/hook.h" + +#include "libpq-fe.h" +#include "cdb/cdbsubplan.h" +#include "cdb/cdbdisp_query.h" +#include "catalog/oid_dispatch.h" +#include "access/xact.h" +#include "executor/execUtils.h" +#include "utils/snapmgr.h" +#include "cdb/memquota.h" +#include "cdb/cdbvars.h" +#include "nodes/print.h" +#include "commands/trigger.h" +#include "cdb/cdbmotion.h" +#include "cdb/ml_ipc.h" +#include "utils/metrics_utils.h" +#include "commands/copy.h" +#include "commands/createas.h" +#include "commands/matview.h" +#include "foreign/fdwapi.h" +#include "executor/nodeHash.h" +#include "executor/nodeSubplan.h" +#include "cdb/cdbexplain.h" +#include "utils/guc_tables.h" +#include "jit/jit.h" +#include "storage/bufmgr.h" +#include "cdb/cdbexplain.h" +#include "cdb/cdbconn.h" +#include <math.h> +#include "utils/ruleutils.h" +#include "utils/json.h" +#include "cdb/cdbdisp.h" +#include "utils/queryjumble.h" +#include "nodes/makefuncs.h" +#include "utils/lsyscache.h" +#include "parser/parsetree.h" +#include "utils/typcache.h" +#include "utils/builtins.h" +#include "nodes/extensible.h" +#include "cdb/cdbendpoint.h" +#include "vecnodes/nodes.h" + +#include "vecexecutor/executor.h" + + +/* Convert bytes into kilobytes */ +#define kb(x) (floor((x + 1023.0) / 1024.0)) + +/* OR-able flags for ExplainXMLTag() */ +#define X_OPENING 0 +#define X_CLOSING 1 +#define X_CLOSE_IMMEDIATE 2 +#define X_NOWHITESPACE 4 + +static void VecExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc); +static void VecExplainOpenGroup(const char *objtype, const char *labelname, + bool labeled, ExplainState
*es); +static void VecExplainCloseGroup(const char *objtype, const char *labelname, + bool labeled, ExplainState *es); +static void +VecExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, + const char *queryString, ParamListInfo params, + QueryEnvironment *queryEnv, const instr_time *planduration, + const BufferUsage *bufusage, + int cursorOptions); +static void VecExplainNode(PlanState *planstate, List *ancestors, + const char *relationship, const char *plan_name, + ExplainState *es); +static void VecExplainPrintSettings(ExplainState *es, PlanGenerator planGen); +static void VecExplainSubPlans(List *plans, List *ancestors, + const char *relationship, ExplainState *es, + SliceTable *sliceTable); +static void ExplainFlushWorkersState(ExplainState *es); +static void +ExplainXMLTag(const char *tagname, int flags, ExplainState *es); +static void ExplainMemberNodes(PlanState **planstates, int nplans, + List *ancestors, ExplainState *es); +static void ExplainMissingMembers(int nplans, int nchildren, ExplainState *es); +static void ExplainPrintJIT(ExplainState *es, int jit_flags, + JitInstrumentation *ji); +static void ExplainOpenWorker(int n, ExplainState *es); +static void ExplainCloseWorker(int n, ExplainState *es); +static void ExplainSaveGroup(ExplainState *es, int depth, int *state_save); +static void +ExplainRestoreGroup(ExplainState *es, int depth, int *state_save); +static void +ExplainOpenSetAsideGroup(const char *objtype, const char *labelname, + bool labeled, int depth, ExplainState *es); +static void ExplainPropertyStringInfo(const char *qlabel, ExplainState *es, + const char *fmt,...) 
+ pg_attribute_printf(3, 4); + +static void ExplainIndentText(ExplainState *es); +static bool VecExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used); +static ExplainWorkersState * ExplainCreateWorkersState(int num_workers); +static void ExplainYAMLLineStarting(ExplainState *es); +static void ExplainJSONLineEnding(ExplainState *es); +static void +ExplainCustomChildren(CustomScanState *css, List *ancestors, ExplainState *es); + + +static double elapsed_time(instr_time *starttime); + +static void show_plan_tlist(PlanState *planstate, List *ancestors, + ExplainState *es); +static void show_expression(Node *node, const char *qlabel, + PlanState *planstate, List *ancestors, + bool useprefix, ExplainState *es); +static void show_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + bool useprefix, ExplainState *es); +static void show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +static void show_upper_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es); +static void show_sort_keys(SortState *sortstate, List *ancestors, + ExplainState *es); +static void show_incremental_sort_keys(IncrementalSortState *incrsortstate, + List *ancestors, ExplainState *es); +static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors, + ExplainState *es); +static void show_agg_keys(AggState *astate, List *ancestors, + ExplainState *es); +static void show_tuple_split_keys(TupleSplitState *tstate, List *ancestors, + ExplainState *es); +static void show_grouping_sets(PlanState *planstate, Agg *agg, + List *ancestors, ExplainState *es); +static void show_grouping_set_keys(PlanState *planstate, + Agg *aggnode, Sort *sortnode, + List *context, bool useprefix, + List *ancestors, ExplainState *es); +static void show_sort_group_keys(PlanState *planstate, const char *qlabel, + int nkeys, int nPresortedKeys, AttrNumber *keycols, + Oid 
*sortOperators, Oid *collations, bool *nullsFirst, + List *ancestors, ExplainState *es); +static void show_sortorder_options(StringInfo buf, Node *sortexpr, + Oid sortOperator, Oid collation, bool nullsFirst); +static void show_tablesample(TableSampleClause *tsc, PlanState *planstate, + List *ancestors, ExplainState *es); +static void show_sort_info(SortState *sortstate, ExplainState *es); +static void show_windowagg_keys(WindowAggState *waggstate, List *ancestors, ExplainState *es); +static void show_incremental_sort_info(IncrementalSortState *incrsortstate, + ExplainState *es); +static void show_hash_info(HashState *hashstate, ExplainState *es); +static void show_runtime_filter_info(RuntimeFilterState *rfstate, + ExplainState *es); +static void show_memoize_info(MemoizeState *mstate, List *ancestors, + ExplainState *es); +static void show_hashagg_info(AggState *hashstate, ExplainState *es); +static void show_tidbitmap_info(BitmapHeapScanState *planstate, + ExplainState *es); +static void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); +static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); +static void show_eval_params(Bitmapset *bms_params, ExplainState *es); +static void show_join_pruning_info(List *join_prune_ids, ExplainState *es); +static const char *explain_get_index_name(Oid indexId); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, + bool planning); +static void show_wal_usage(ExplainState *es, const WalUsage *usage); +static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, + ExplainState *es); +static void ExplainScanTarget(Scan *plan, ExplainState *es); +static void ExplainModifyTarget(ModifyTable *plan, ExplainState *es); +static void ExplainTargetRel(Plan *plan, Index rti, ExplainState *es); +static void show_modifytable_info(ModifyTableState *mtstate, List *ancestors, + ExplainState *es); + +/* explain_gp.c */ +static void 
cdbexplain_showExecStats(struct PlanState *planstate, ExplainState *es); +static void cdbexplain_formatMemory(char *outbuf, int bufsize, double bytes); +static void cdbexplain_formatSeg(char *outbuf, int bufsize, int segindex, int nInst); +static void cdbexplain_formatSeconds(char *outbuf, int bufsize, double seconds, bool unit); +static void cdbexplain_formatExtraText(StringInfo str, int indent, int segindex, const char *notes, int notelen); +static bool nodeSupportWorkfileCaching(PlanState *planstate); +static void cdbexplain_showExecStatsEnd(struct PlannedStmt *stmt, + struct CdbExplain_ShowStatCtx *showstatctx, + struct EState *estate, + ExplainState *es); +static void +show_dispatch_info(ExecSlice *slice, ExplainState *es, Plan *plan); + +static void show_motion_keys(PlanState *planstate, List *hashExpr, int nkeys, + AttrNumber *keycols, const char *qlabel, + List *ancestors, ExplainState *es); + +static void +gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, + struct EState *estate, + ExplainState *es); + +/* EXPLAIN ANALYZE statistics for one plan node of a slice */ +typedef struct CdbExplain_StatInst +{ + NodeTag pstype; /* PlanState node type */ + + /* fields from Instrumentation struct */ + instr_time starttime; /* Start time of current iteration of node */ + instr_time counter; /* Accumulated runtime for this node */ + double firsttuple; /* Time for first tuple of this cycle */ + double startup; /* Total startup time (in seconds) */ + double total; /* Total total time (in seconds) */ + double ntuples; /* Total tuples produced */ + double ntuples2; + double nloops; /* # of run cycles for this node */ + double nfiltered1; + double nfiltered2; + double execmemused; /* executor memory used (bytes) */ + double workmemused; /* work_mem actually used (bytes) */ + double workmemwanted; /* work_mem to avoid workfile i/o (bytes) */ + bool workfileCreated; /* workfile created in this node */ + instr_time firststart; /* Start time of first 
iteration of node */ + int numPartScanned; /* Number of part tables scanned */ + + TuplesortInstrumentation sortstats; /* Sort stats, if this is a Sort node */ + HashInstrumentation hashstats; /* Hash stats, if this is a Hash node */ + IncrementalSortGroupInfo fullsortGroupInfo; /* Full sort group info for Incremental Sort node */ + IncrementalSortGroupInfo prefixsortGroupInfo; /* Prefix sort group info for Incremental Sort node */ + int bnotes; /* Offset to beginning of node's extra text */ + int enotes; /* Offset to end of node's extra text */ + int nworkers_launched; /* Number of workers launched for this node */ + WalUsage walusage; /* add WAL usage */ + /* fields from Instrumentation struct for one cycle of a node */ + double tuplecount; + QueryMetricsStatus nodeStatus; /* CDB: status */ +} CdbExplain_StatInst; + +/* EXPLAIN ANALYZE statistics for one process working on one slice */ +typedef struct CdbExplain_SliceWorker +{ + double peakmemused; /* bytes alloc in per-query mem context tree */ + double vmem_reserved; /* vmem reserved by a QE */ +} CdbExplain_SliceWorker; + +/* Dispatch status summarized over workers in a slice */ +typedef struct CdbExplain_DispatchSummary +{ + int nResult; + int nOk; + int nError; + int nCanceled; + int nNotDispatched; + int nIgnorableError; +} CdbExplain_DispatchSummary; + + +/* One node's EXPLAIN ANALYZE statistics for all the workers of its segworker group */ +typedef struct CdbExplain_NodeSummary +{ + /* Summary over all the node's workers */ + CdbExplain_Agg ntuples; + CdbExplain_Agg runtime_tupleAgg; + CdbExplain_Agg execmemused; + CdbExplain_Agg workmemused; + CdbExplain_Agg workmemwanted; + CdbExplain_Agg totalWorkfileCreated; + /* Used for DynamicSeqScan, DynamicIndexScan and DynamicBitmapHeapScan */ + CdbExplain_Agg totalPartTableScanned; + /* Summary of space used by sort */ + CdbExplain_Agg sortSpaceUsed[NUM_SORT_SPACE_TYPE][NUM_SORT_METHOD]; + + /* insts array info */ + int segindex0; /* segment id of insts[0] */ +
int ninst; /* num of StatInst entries in inst array */ + + /* Array [0..ninst-1] of StatInst entries is appended starting here */ + CdbExplain_StatInst insts[1]; /* variable size - must be last */ +} CdbExplain_NodeSummary; + +/* One slice's statistics for all the workers of its segworker group */ +typedef struct CdbExplain_SliceSummary +{ + ExecSlice *slice; + + /* worker array */ + int nworker; /* num of SliceWorker slots in worker array */ + int segindex0; /* segment id of workers[0] */ + CdbExplain_SliceWorker *workers; /* -> array [0..nworker-1] of + * SliceWorker */ + CdbExplain_Agg peakmemused; /* Summary of SliceWorker stats over all of + * the slice's workers */ + + CdbExplain_Agg vmem_reserved; /* vmem reserved by QEs */ + + /* Rollup of per-node stats over all of the slice's workers and nodes */ + double workmemused_max; + double workmemwanted_max; + + /* How many workers were dispatched and returned results? (0 if local) */ + CdbExplain_DispatchSummary dispatchSummary; +} CdbExplain_SliceSummary; + + +/* State for cdbexplain_showExecStats() */ +typedef struct CdbExplain_ShowStatCtx +{ + StringInfoData extratextbuf; + instr_time querystarttime; + + /* Rollup of per-node stats over the entire query plan */ + double workmemused_max; + double workmemwanted_max; + + bool stats_gathered; + /* Per-slice statistics are deposited in this SliceSummary array */ + int nslice; /* num of slots in slices array */ + CdbExplain_SliceSummary *slices; /* -> array[0..nslice-1] of + * SliceSummary */ + bool runtime; +} CdbExplain_ShowStatCtx; + +void VecExplainOneQuery(Query *query, int cursorOptions, + IntoClause *into, ExplainState *es, + const char *queryString, ParamListInfo params, + QueryEnvironment *queryEnv) +{ + + PlannedStmt *plan; + instr_time planstart, planduration; + BufferUsage bufusage_start, bufusage; + bool vec_type = false; + + if (es->buffers) + bufusage_start = pgBufferUsage; + INSTR_TIME_SET_CURRENT(planstart); + + /* plan the query */ + plan = 
pg_plan_query(query, queryString, cursorOptions, params); + + INSTR_TIME_SET_CURRENT(planduration); + INSTR_TIME_SUBTRACT(planduration, planstart); + + if (plan->extensionContext) + vec_type = find_extension_context(plan->extensionContext); + + if (!vec_type && vec_explain_prev) { + (*vec_explain_prev) (query, cursorOptions, into, es, queryString, params, queryEnv); + return; + } + + /* + * GPDB_92_MERGE_FIXME: it really should be an optimizer's responsibility + * to correctly set the into-clause and into-policy of the PlannedStmt. + */ + if (into != NULL) + plan->intoClause = copyObject(into); + + /* calc differences of buffer counters. */ + if (es->buffers) + { + memset(&bufusage, 0, sizeof(BufferUsage)); + BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start); + } + + (vec_type ? VecExplainOnePlan : ExplainOnePlan) + (plan, into, es, queryString, params, queryEnv, + &planduration, (es->buffers ? &bufusage : NULL), cursorOptions); +} + +/* ---------------- + * dummy Vec DestReceiver functions + * ---------------- + */ +bool +donothingVecReceive(TupleTableSlot *slot, DestReceiver *self) +{ + return true; +} + +void +donothingVecStartup(DestReceiver *self, int operation, TupleDesc typeinfo) +{ +} + +void +donothingVecCleanup(DestReceiver *self) +{ + /* this is used for both shutdown and destroy methods */ +} + +/* ---------------- + * static DestReceiver structs for dest types needing no local state + * ---------------- + */ +DestReceiver donothingVecDR = { + donothingVecReceive, donothingVecStartup, donothingVecCleanup, donothingVecCleanup, + DestNone +}; + +/* + * VecExplainOnePlan - + * given a planned query, execute it if needed, and then print + * EXPLAIN output + * + * "into" is NULL unless we are explaining the contents of a CreateTableAsStmt, + * in which case executing the query should result in creating that table. 
 *
 * This is exported because it's called back from prepare.c in the
 * EXPLAIN EXECUTE case, and because an index advisor plugin would need
 * to call it.
 *
 * NOTE(review): unlike upstream ExplainOnePlan, this vectorized variant is
 * declared static, so the "exported" remark inherited from upstream is
 * stale here.
 */
static void
VecExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
				  const char *queryString, ParamListInfo params,
				  QueryEnvironment *queryEnv, const instr_time *planduration,
				  const BufferUsage *bufusage,
				  int cursorOptions)
{
	DestReceiver *dest;
	QueryDesc  *queryDesc;
	instr_time	starttime;
	double		totaltime = 0;
	int			eflags;
	int			instrument_option = 0;

	/* Utility statements are handled elsewhere; only plannable queries here. */
	Assert(plannedstmt->commandType != CMD_UTILITY);

	/*
	 * Choose instrumentation flags from the EXPLAIN options.  TIMER implies
	 * per-node timing; ROWS alone is the cheaper ANALYZE mode.
	 */
	if (es->analyze && es->timing)
		instrument_option |= INSTRUMENT_TIMER;
	else if (es->analyze)
		instrument_option |= INSTRUMENT_ROWS;

	if (es->buffers)
		instrument_option |= INSTRUMENT_BUFFERS;
	if (es->wal)
		instrument_option |= INSTRUMENT_WAL;

	/* GPDB: CDB-specific instrumentation is always collected under ANALYZE. */
	if (es->analyze)
		instrument_option |= INSTRUMENT_CDB;

	if (es->memory_detail)
		instrument_option |= INSTRUMENT_MEMORY_DETAIL;

	/*
	 * We always collect timing for the entire statement, even when node-level
	 * timing is off, so we don't look at es->timing here.  (We could skip
	 * this if !es->summary, but it's hardly worth the complication.)
	 */
	INSTR_TIME_SET_CURRENT(starttime);

	/*
	 * Use a snapshot with an updated command ID to ensure this query sees
	 * results of any previously executed queries.
	 */
	PushCopiedSnapshot(GetActiveSnapshot());
	UpdateActiveSnapshotCommandId();

	/*
	 * Normally we discard the query's output, but if explaining CREATE TABLE
	 * AS, we'd better use the appropriate tuple receiver.
	 */
	if (into)
		dest = CreateIntoRelDestReceiver(into);
	else
		dest = (DestReceiver *) &donothingVecDR;	/* vectorized no-op receiver */

	// GPDB_14_MERGE_FIXME: fix intoClause in optimizer
	plannedstmt->intoClause = copyObject(into);

	/* Create a QueryDesc for the query */
	queryDesc = CreateQueryDesc(plannedstmt, queryString,
								GetActiveSnapshot(), InvalidSnapshot,
								dest, params, queryEnv, instrument_option);

	/* GPDB hook for collecting query info */
	if (query_info_collect_hook)
		(*query_info_collect_hook)(METRICS_QUERY_SUBMIT, queryDesc);

	/* Allocate workarea for summary stats. */
	if (es->analyze)
	{
		/* Attach workarea to QueryDesc so ExecSetParamPlan() can find it. */
		queryDesc->showstatctx = cdbexplain_showExecStatsBegin(queryDesc,
															   starttime);
	}
	else
		queryDesc->showstatctx = NULL;

	/* Select execution options */
	if (es->analyze)
		eflags = 0;				/* default run-to-completion flags */
	else
		eflags = EXEC_FLAG_EXPLAIN_ONLY;
	if (into)
		eflags |= GetIntoRelEFlags(into);

	/* GPDB: assign the query's memory quota before starting execution. */
	queryDesc->plannedstmt->query_mem =
		ResourceManagerGetQueryMemoryLimit(queryDesc->plannedstmt);

	/* call ExecutorStart to prepare the plan for execution */
	ExecutorStartWrapper(queryDesc, eflags);

	/* Execute the plan for statistics if asked for */
	if (es->analyze)
	{
		ScanDirection dir;

		/* EXPLAIN ANALYZE CREATE TABLE AS WITH NO DATA is weird */
		if (into && into->skipData)
			dir = NoMovementScanDirection;
		else
			dir = ForwardScanDirection;

		/* run the plan */
		ExecutorRunWrapper(queryDesc, dir, 0L, true);

		/* Wait for completion of all qExec processes. */
		if (queryDesc->estate->dispatcherState && queryDesc->estate->dispatcherState->primaryResults)
			cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE);

		/* run cleanup too */
		ExecutorFinish(queryDesc);

		/*
		 * We can't run ExecutorEnd 'till we're done printing the stats...
		 */
		totaltime += elapsed_time(&starttime);
	}

	VecExplainOpenGroup("Query", NULL, true, es);

	/* Create textual dump of plan tree */
	VecExplainPrintPlan(es, queryDesc);

	if (cursorOptions & CURSOR_OPT_PARALLEL_RETRIEVE)
		ExplainParallelRetrieveCursor(es, queryDesc);

	/*
	 * COMPUTE_QUERY_ID_REGRESS means COMPUTE_QUERY_ID_AUTO, but we don't show
	 * the queryid in any of the EXPLAIN plans to keep stable the results
	 * generated by regression test suites.
	 */
	if (es->verbose && plannedstmt->queryId != UINT64CONST(0) &&
		compute_query_id != COMPUTE_QUERY_ID_REGRESS)
	{
		/*
		 * Output the queryid as an int64 rather than a uint64 so we match
		 * what would be seen in the BIGINT pg_stat_statements.queryid column.
		 */
		ExplainPropertyInteger("Query Identifier", NULL, (int64)
							   plannedstmt->queryId, es);
	}

	/* Show buffer usage in planning */
	if (bufusage)
	{
		VecExplainOpenGroup("Planning", "Planning", true, es);
		show_buffer_usage(es, bufusage, true);
		VecExplainCloseGroup("Planning", "Planning", true, es);
	}

	if (es->summary && planduration)
	{
		double		plantime = INSTR_TIME_GET_DOUBLE(*planduration);

		ExplainPropertyFloat("Planning Time", "ms", 1000.0 * plantime, 3, es);
	}

	/* Print slice table */
	if (es->slicetable)
		ExplainPrintSliceTable(es, queryDesc);

	/* Print info about runtime of triggers */
	if (es->analyze)
		ExplainPrintTriggers(es, queryDesc);

	/*
	 * Display per-slice and whole-query statistics.
	 */
	if (es->analyze)
		cdbexplain_showExecStatsEnd(queryDesc->plannedstmt, queryDesc->showstatctx,
									queryDesc->estate, es);

	/*
	 * NOTE(review): the Optimizer line is only emitted for TEXT format here;
	 * confirm whether non-text formats are intentionally excluded.
	 */
	if (es->format == EXPLAIN_FORMAT_TEXT)
	{
		VecExplainOpenGroup("Settings", "Settings", true, es);

		if (queryDesc->plannedstmt->planGen == PLANGEN_PLANNER)
			ExplainPropertyStringInfo("Optimizer", es, "Postgres query optimizer");
#ifdef USE_ORCA
		else
			ExplainPropertyStringInfo("Optimizer", es, "Pivotal Optimizer (GPORCA)");
#endif

		VecExplainCloseGroup("Settings", "Settings", true, es);
	}

	/*
	 * Print info about JITing. Tied to es->costs because we don't want to
	 * display this in regression tests, as it'd cause output differences
	 * depending on build options. Might want to separate that out from COSTS
	 * at a later stage.
	 */
	if (es->costs)
		ExplainPrintJITSummary(es, queryDesc);

	/*
	 * Close down the query and free resources.  Include time for this in the
	 * total execution time (although it should be pretty minimal).
	 */
	INSTR_TIME_SET_CURRENT(starttime);

	ExecutorEndWrapper(queryDesc);

	FreeQueryDesc(queryDesc);

	PopActiveSnapshot();

	/* We need a CCI just in case query expanded to multiple plans */
	if (es->analyze)
		CommandCounterIncrement();

	totaltime += elapsed_time(&starttime);

	/*
	 * We only report execution time if we actually ran the query (that is,
	 * the user specified ANALYZE), and if summary reporting is enabled (the
	 * user can set SUMMARY OFF to not have the timing information included in
	 * the output).  By default, ANALYZE sets SUMMARY to true.
	 */
	if (es->summary && es->analyze)
		ExplainPropertyFloat("Execution Time", "ms", 1000.0 * totaltime, 3,
							 es);

	VecExplainCloseGroup("Query", NULL, true, es);
}

/*
 * Open a group of related objects.
 *
 * objtype is the type of the group object, labelname is its label within
 * a containing object (if any).
+ * + * If labeled is true, the group members will be labeled properties, + * while if it's false, they'll be unlabeled objects. + */ +static void +VecExplainOpenGroup(const char *objtype, const char *labelname, + bool labeled, ExplainState *es) +{ + switch (es->format) + { + case EXPLAIN_FORMAT_TEXT: + /* nothing to do */ + break; + + case EXPLAIN_FORMAT_XML: + ExplainXMLTag(objtype, X_OPENING, es); + es->indent++; + break; + + case EXPLAIN_FORMAT_JSON: + ExplainJSONLineEnding(es); + appendStringInfoSpaces(es->str, 2 * es->indent); + if (labelname) + { + escape_json(es->str, labelname); + appendStringInfoString(es->str, ": "); + } + appendStringInfoChar(es->str, labeled ? '{' : '['); + + /* + * In JSON format, the grouping_stack is an integer list. 0 means + * we've emitted nothing at this grouping level, 1 means we've + * emitted something (and so the next item needs a comma). See + * ExplainJSONLineEnding(). + */ + es->grouping_stack = lcons_int(0, es->grouping_stack); + es->indent++; + break; + + case EXPLAIN_FORMAT_YAML: + + /* + * In YAML format, the grouping stack is an integer list. 0 means + * we've emitted nothing at this grouping level AND this grouping + * level is unlabeled and must be marked with "- ". See + * ExplainYAMLLineStarting(). + */ + ExplainYAMLLineStarting(es); + if (labelname) + { + appendStringInfo(es->str, "%s: ", labelname); + es->grouping_stack = lcons_int(1, es->grouping_stack); + } + else + { + appendStringInfoString(es->str, "- "); + es->grouping_stack = lcons_int(0, es->grouping_stack); + } + es->indent++; + break; + } +} + +/* + * Close a group of related objects. + * Parameters must match the corresponding ExplainOpenGroup call. 
 */
static void
VecExplainCloseGroup(const char *objtype, const char *labelname,
					 bool labeled, ExplainState *es)
{
	switch (es->format)
	{
		case EXPLAIN_FORMAT_TEXT:
			/* nothing to do */
			break;

		case EXPLAIN_FORMAT_XML:
			es->indent--;
			ExplainXMLTag(objtype, X_CLOSING, es);
			break;

		case EXPLAIN_FORMAT_JSON:
			/* Emit the matching closing brace/bracket and pop the level. */
			es->indent--;
			appendStringInfoChar(es->str, '\n');
			appendStringInfoSpaces(es->str, 2 * es->indent);
			appendStringInfoChar(es->str, labeled ? '}' : ']');
			es->grouping_stack = list_delete_first(es->grouping_stack);
			break;

		case EXPLAIN_FORMAT_YAML:
			/* YAML needs no closing text, just pop the grouping level. */
			es->indent--;
			es->grouping_stack = list_delete_first(es->grouping_stack);
			break;
	}
}

/*
 * VecExplainPrintPlan -
 *	  convert a QueryDesc's plan tree to text and append it to es->str
 *
 * The caller should have set up the options fields of *es, as well as
 * initializing the output buffer es->str.  Also, output formatting state
 * such as the indent level is assumed valid.  Plan-tree-specific fields
 * in *es are initialized here.
 *
 * NB: will not work on utility statements
 */
static void
VecExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc)
{
	EState	   *estate = queryDesc->estate;
	Bitmapset  *rels_used = NULL;
	PlanState  *ps;

	/* Set up ExplainState fields associated with this plan tree */
	Assert(queryDesc->plannedstmt != NULL);
	es->pstmt = queryDesc->plannedstmt;
	es->rtable = queryDesc->plannedstmt->rtable;
	es->showstatctx = queryDesc->showstatctx;

	/* CDB: Find slice table entry for the root slice. */
	es->currentSlice = getCurrentSlice(estate, LocallyExecutingSliceIndex(estate));

	/*
	 * Get local stats if root slice was executed here in the qDisp, as long
	 * as we haven't already gathered the statistics.  This can happen when an
	 * executor hook generates EXPLAIN output.
	 */
	if (es->analyze && !es->showstatctx->stats_gathered)
	{
		es->showstatctx->runtime = es->runtime;
		if (Gp_role != GP_ROLE_EXECUTE && (!es->currentSlice || sliceRunsOnQD(es->currentSlice)))
			cdbexplain_localExecStats(queryDesc->planstate, es->showstatctx);

		/* Fill in the plan's Instrumentation with stats from qExecs. */
		if (estate->dispatcherState && estate->dispatcherState->primaryResults)
		{
			cdbexplain_recvExecStats(queryDesc->planstate,
									 estate->dispatcherState->primaryResults,
									 LocallyExecutingSliceIndex(estate),
									 es->showstatctx);
		}
	}

	/*
	 * Identify referenced RTEs first, so select_rtable_names_for_explain can
	 * assign aliases only to the tables that will actually be printed.
	 */
	VecExplainPreScanNode(queryDesc->planstate, &rels_used);
	es->rtable_names = select_rtable_names_for_explain(es->rtable, rels_used);
	es->deparse_cxt = deparse_context_for_plan_tree(queryDesc->plannedstmt,
													es->rtable_names);
	es->printed_subplans = NULL;

	/*
	 * Sometimes we mark a Gather node as "invisible", which means that it's
	 * not to be displayed in EXPLAIN output.  The purpose of this is to allow
	 * running regression tests with force_parallel_mode=regress to get the
	 * same results as running the same tests with force_parallel_mode=off.
	 * Such marking is currently only supported on a Gather at the top of the
	 * plan.  We skip that node, and we must also hide per-worker detail data
	 * further down in the plan tree.
	 */
	ps = queryDesc->planstate;
	if (IsA(ps, GatherState) && ((Gather *) ps->plan)->invisible)
	{
		ps = outerPlanState(ps);
		es->hide_workers = true;
	}
	VecExplainNode(ps, NIL, NULL, NULL, es);

	/*
	 * If requested, include information about GUC parameters with values that
	 * don't match the built-in defaults.
	 */
	VecExplainPrintSettings(es, queryDesc->plannedstmt->planGen);
}

/*
 * VecExplainPreScanNode -
 *	  Prescan the planstate tree to identify which RTEs are referenced
 *
 * Adds the relid of each referenced RTE to *rels_used.  The result controls
 * which RTEs are assigned aliases by select_rtable_names_for_explain.
+ * This ensures that we don't confusingly assign un-suffixed aliases to RTEs + * that never appear in the EXPLAIN output (such as inheritance parents). + */ +static bool +VecExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) +{ + Plan *plan = planstate->plan; + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_IndexScan: + case T_IndexOnlyScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_ShareInputScan: + *rels_used = bms_add_member(*rels_used, + ((Scan *) plan)->scanrelid); + break; + case T_ForeignScan: + *rels_used = bms_add_members(*rels_used, + ((ForeignScan *) plan)->fs_relids); + break; + case T_CustomScan: + *rels_used = bms_add_members(*rels_used, + ((CustomScan *) plan)->custom_relids); + break; + case T_ModifyTable: + *rels_used = bms_add_member(*rels_used, + ((ModifyTable *) plan)->nominalRelation); + if (((ModifyTable *) plan)->exclRelRTI) + *rels_used = bms_add_member(*rels_used, + ((ModifyTable *) plan)->exclRelRTI); + break; + case T_Append: + *rels_used = bms_add_members(*rels_used, + ((Append *) plan)->apprelids); + break; + case T_MergeAppend: + *rels_used = bms_add_members(*rels_used, + ((MergeAppend *) plan)->apprelids); + break; + default: + break; + } + + return planstate_tree_walker(planstate, VecExplainPreScanNode, rels_used); +} + + +/* + * VecExplainNode - + * Appends a description of a plan tree to es->str + * + * planstate points to the executor state node for the current plan node. + * We need to work from a PlanState node, not just a Plan node, in order to + * get at the instrumentation data (if any) as well as the list of subplans. + * + * ancestors is a list of parent Plan and SubPlan nodes, most-closely-nested + * first. These are needed in order to interpret PARAM_EXEC Params. 
+ * + * relationship describes the relationship of this plan node to its parent + * (eg, "Outer", "Inner"); it can be null at top level. plan_name is an + * optional name to be attached to the node. + * + * In text format, es->indent is controlled in this function since we only + * want it to change at plan-node boundaries (but a few subroutines will + * transiently increment it). In non-text formats, es->indent corresponds + * to the nesting depth of logical output groups, and therefore is controlled + * by VecExplainOpenGroup/VecExplainCloseGroup. + * + * es->parentPlanState points to the parent planstate node and can be used by + * PartitionSelector to deparse its printablePredicate. (This is passed in + * ExplainState rather than as a normal argument, to avoid changing the + * function signature from upstream.) + */ +static void +VecExplainNode(PlanState *planstate, List *ancestors, + const char *relationship, const char *plan_name, + ExplainState *es) +{ + Plan *plan = planstate->plan; + PlanState *parentplanstate; + ExecSlice *save_currentSlice = es->currentSlice; /* save */ + const char *pname; /* node type name for text output */ + const char *sname; /* node type name for non-text output */ + const char *strategy = NULL; + const char *partialmode = NULL; + const char *operation = NULL; + const char *custom_name = NULL; + ExplainWorkersState *save_workers_state = es->workers_state; + int save_indent = es->indent; + bool haschildren; + bool skip_outer=false; + char *skip_outer_msg = NULL; + int motion_recv; + int motion_snd; + ExecSlice *parentSlice = NULL; + bool vec_type = find_extension_context(es->pstmt->extensionContext); + + /* Remember who called us. */ + parentplanstate = es->parentPlanState; + es->parentPlanState = planstate; + + /* + * If this is a Motion node, we're descending into a new slice. 
+ */ + if (IsA(plan, Motion)) + { + Motion *pMotion = (Motion *) plan; + SliceTable *sliceTable = planstate->state->es_sliceTable; + + if (sliceTable) + { + es->currentSlice = &sliceTable->slices[pMotion->motionID]; + parentSlice = es->currentSlice->parentIndex == -1 ? NULL : + &sliceTable->slices[es->currentSlice->parentIndex]; + } + } + + /* + * Prepare per-worker output buffers, if needed. We'll append the data in + * these to the main output string further down. + */ + if (planstate->worker_instrument && es->analyze && !es->hide_workers) + es->workers_state = ExplainCreateWorkersState(planstate->worker_instrument->num_workers); + else + es->workers_state = NULL; + + /* Identify plan node type, and print generic details */ + switch (nodeTag(plan)) + { + case T_Result: + if (vec_type) + { + pname = sname = "Vec Result"; + } + else + { + pname = sname = "Result"; + } + break; + case T_ProjectSet: + pname = sname = "ProjectSet"; + break; + case T_ModifyTable: + sname = "ModifyTable"; + switch (((ModifyTable *) plan)->operation) + { + case CMD_INSERT: + pname = operation = "Insert"; + break; + case CMD_UPDATE: + pname = operation = "Update"; + break; + case CMD_DELETE: + pname = operation = "Delete"; + break; + default: + pname = "???"; + break; + } + break; + case T_Append: + if (vec_type) + pname = sname = "Vec Append"; + else + pname = sname = "Append"; + break; + case T_MergeAppend: + pname = sname = "Merge Append"; + break; + case T_RecursiveUnion: + pname = sname = "Recursive Union"; + break; + case T_Sequence: + if (vec_type) + pname = sname = "Vec Sequence"; + else + pname = sname = "Sequence"; + break; + case T_BitmapAnd: + pname = sname = "BitmapAnd"; + break; + case T_BitmapOr: + pname = sname = "BitmapOr"; + break; + case T_NestLoop: + if (vec_type) + pname = sname = "Vec Nested Loop"; + else + pname = sname = "Nested Loop"; + if (((NestLoop *)plan)->shared_outer) + { + skip_outer = true; + skip_outer_msg = "See first subplan of Hash Join"; + } + break; 
+ case T_MergeJoin: + pname = "Merge"; /* "Join" gets added by jointype switch */ + sname = "Merge Join"; + break; + case T_HashJoin: + if (vec_type) + { + pname = "Vec Hash"; /* "Join" gets added by jointype switch */ + sname = "Vec Hash Join"; + } + else + { + pname = "Hash"; /* "Join" gets added by jointype switch */ + sname = "Hash Join"; + } + break; + case T_SeqScan: + if (vec_type) + { + pname = sname = "Vec Seq Scan"; + } + else + { + pname = sname = "Seq Scan"; + } + break; + case T_SampleScan: + pname = sname = "Sample Scan"; + break; + case T_Gather: + pname = sname = "Gather"; + break; + case T_GatherMerge: + pname = sname = "Gather Merge"; + break; + case T_IndexScan: + pname = sname = "Index Scan"; + break; + case T_IndexOnlyScan: + pname = sname = "Index Only Scan"; + break; + case T_BitmapIndexScan: + pname = sname = "Bitmap Index Scan"; + break; + case T_BitmapHeapScan: + /* + * We print "Bitmap Heap Scan", even for AO tables. It's a bit + * confusing, but that's what the plan node is called, regardless + * of the table type. 
+ */ + pname = sname = "Bitmap Heap Scan"; + break; + case T_TidScan: + pname = sname = "Tid Scan"; + break; + case T_TidRangeScan: + pname = sname = "Tid Range Scan"; + break; + case T_SubqueryScan: + if (vec_type) + pname = sname = "Vec Subquery Scan"; + else + pname = sname = "Subquery Scan"; + break; + case T_FunctionScan: + pname = sname = "Function Scan"; + break; + case T_TableFuncScan: + pname = sname = "Table Function Scan"; + break; + case T_ValuesScan: + pname = sname = "Values Scan"; + break; + case T_CteScan: + pname = sname = "CTE Scan"; + break; + case T_NamedTuplestoreScan: + pname = sname = "Named Tuplestore Scan"; + break; + case T_WorkTableScan: + pname = sname = "WorkTable Scan"; + break; + case T_ShareInputScan: + if (vec_type) + { + pname = sname = "Vec Shared Scan"; + } + else + { + pname = sname = "Shared Scan"; + } + break; + case T_ForeignScan: + sname = "Foreign Scan"; + switch (((ForeignScan *) plan)->operation) + { + case CMD_SELECT: + pname = "Foreign Scan"; + operation = "Select"; + break; + case CMD_INSERT: + pname = "Foreign Insert"; + operation = "Insert"; + break; + case CMD_UPDATE: + pname = "Foreign Update"; + operation = "Update"; + break; + case CMD_DELETE: + pname = "Foreign Delete"; + operation = "Delete"; + break; + default: + pname = "???"; + break; + } + break; + case T_CustomScan: + sname = "Custom Scan"; + custom_name = ((CustomScan *) plan)->methods->CustomName; + if (custom_name) + pname = psprintf("Custom Scan (%s)", custom_name); + else + pname = sname; + break; + case T_Material: + if (vec_type) + { + pname = sname = "Vec Materialize"; + } + else + { + pname = sname = "Materialize"; + } + break; + case T_Memoize: + pname = sname = "Memoize"; + break; + case T_Sort: + if (vec_type) + { + pname = sname = "Vec Sort"; + } + else + { + pname = sname = "Sort"; + } + break; + case T_TupleSplit: + pname = sname = "TupleSplit"; + break; + case T_IncrementalSort: + pname = sname = "Incremental Sort"; + break; + case T_Group: 
+ pname = sname = "Group"; + break; + case T_Agg: + if (vec_type) + { + Agg *agg = (Agg *) plan; + + sname = "Aggregate"; + switch (agg->aggstrategy) + { + case AGG_PLAIN: + pname = "Vec Aggregate"; + strategy = "Plain"; + break; + case AGG_SORTED: + pname = "Vec GroupAggregate"; + strategy = "Sorted"; + break; + case AGG_HASHED: + pname = "Vec HashAggregate"; + strategy = "Hashed"; + break; + case AGG_MIXED: + pname = "Vec MixedAggregate"; + strategy = "Mixed"; + break; + default: + pname = "Vec Aggregate ???"; + strategy = "???"; + break; + } + + if (DO_AGGSPLIT_SKIPFINAL(agg->aggsplit)) + { + partialmode = "Vec Partial"; + pname = psprintf("%s %s", partialmode, pname); + } + else if (DO_AGGSPLIT_COMBINE(agg->aggsplit)) + { + partialmode = "Vec Finalize"; + pname = psprintf("%s %s", partialmode, pname); + } + else + partialmode = "Vec Simple"; + + if (agg->streaming) + pname = psprintf("Vec Streaming %s", pname); + } + else + { + Agg *agg = (Agg *) plan; + + sname = "Aggregate"; + switch (agg->aggstrategy) + { + case AGG_PLAIN: + pname = "Aggregate"; + strategy = "Plain"; + break; + case AGG_SORTED: + pname = "GroupAggregate"; + strategy = "Sorted"; + break; + case AGG_HASHED: + pname = "HashAggregate"; + strategy = "Hashed"; + break; + case AGG_MIXED: + pname = "MixedAggregate"; + strategy = "Mixed"; + break; + default: + pname = "Aggregate ???"; + strategy = "???"; + break; + } + + if (DO_AGGSPLIT_SKIPFINAL(agg->aggsplit)) + { + partialmode = "Partial"; + pname = psprintf("%s %s", partialmode, pname); + } + else if (DO_AGGSPLIT_COMBINE(agg->aggsplit)) + { + partialmode = "Finalize"; + pname = psprintf("%s %s", partialmode, pname); + } + else + partialmode = "Simple"; + + if (agg->streaming) + pname = psprintf("Streaming %s", pname); + } + break; + case T_WindowAgg: + if (vec_type) + { + pname = sname = "Vec WindowAgg"; + } + else + { + pname = sname = "WindowAgg"; + } + + break; + case T_TableFunctionScan: + pname = sname = "Table Function Scan"; + break; + 
case T_Unique: + pname = sname = "Unique"; + break; + case T_SetOp: + sname = "SetOp"; + switch (((SetOp *) plan)->strategy) + { + case SETOP_SORTED: + pname = "SetOp"; + strategy = "Sorted"; + break; + case SETOP_HASHED: + pname = "HashSetOp"; + strategy = "Hashed"; + break; + default: + pname = "SetOp ???"; + strategy = "???"; + break; + } + break; + case T_LockRows: + pname = sname = "LockRows"; + break; + case T_RuntimeFilter: + pname = sname = "RuntimeFilter"; + break; + case T_Limit: + if (vec_type) + { + pname = sname = "Vec Limit"; + } + else + { + pname = sname = "Limit"; + } + break; + case T_Hash: + if (vec_type) + pname = sname = "Vec Hash"; + else + pname = sname = "Hash"; + break; + case T_Motion: + { + Motion *pMotion = (Motion *) plan; + + Assert(plan->lefttree); + + motion_snd = list_length(es->currentSlice->segments); + motion_recv = parentSlice == NULL ? 1 : list_length(parentSlice->segments); + + if (vec_type) + { + switch (pMotion->motionType) + { + case MOTIONTYPE_GATHER: + sname = "Vec Gather Motion"; + motion_recv = 1; + break; + case MOTIONTYPE_GATHER_SINGLE: + sname = "Vec Explicit Gather Motion"; + motion_recv = 1; + break; + case MOTIONTYPE_HASH: + sname = "Vec Redistribute Motion"; + break; + case MOTIONTYPE_BROADCAST: + sname = "Vec Broadcast Motion"; + break; + case MOTIONTYPE_EXPLICIT: + sname = "Vec Explicit Redistribute Motion"; + break; + default: + sname = "???"; + motion_recv = -1; + break; + } + } + else + { + switch (pMotion->motionType) + { + case MOTIONTYPE_GATHER: + sname = "Gather Motion"; + motion_recv = 1; + break; + case MOTIONTYPE_GATHER_SINGLE: + sname = "Explicit Gather Motion"; + motion_recv = 1; + break; + case MOTIONTYPE_HASH: + sname = "Redistribute Motion"; + break; + case MOTIONTYPE_BROADCAST: + sname = "Broadcast Motion"; + break; + case MOTIONTYPE_EXPLICIT: + sname = "Explicit Redistribute Motion"; + break; + default: + sname = "???"; + motion_recv = -1; + break; + } + } + + pname = psprintf("%s %d:%d", 
sname, motion_snd, motion_recv); + } + break; + case T_SplitUpdate: + pname = sname = "Split"; + break; + case T_AssertOp: + if (vec_type) + { + pname = sname = "Vec Assert"; + } + else + { + pname = sname = "Assert"; + } + break; + case T_PartitionSelector: + pname = sname = "Partition Selector"; + break; + default: + pname = sname = "???"; + break; + } + + ExplainOpenGroup("Plan", + relationship ? NULL : "Plan", + true, es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (plan_name) + { + ExplainIndentText(es); + appendStringInfo(es->str, "%s", plan_name); + + /* + * If this SubPlan is being dispatched separately, show slice + * information after the plan name. Currently, we do this for + * Init Plans. + * + * Note: If the top node was a Motion node, we print the slice + * *above* the Motion here. We will print the slice below the + * Motion, below. + */ + if (es->subplanDispatchedSeparately) + show_dispatch_info(save_currentSlice, es, plan); + appendStringInfoChar(es->str, '\n'); + es->indent++; + } + if (es->indent) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "-> "); + es->indent += 2; + } + if (plan->parallel_aware) + appendStringInfoString(es->str, "Parallel "); + if (plan->async_capable) + appendStringInfoString(es->str, "Async "); + appendStringInfoString(es->str, pname); + + /* + * Print information about the current slice. In order to not make + * the output too verbose, only print it at the slice boundaries, + * ie. at Motion nodes. (We already switched the "current slice" + * to the slice below the Motion.) 
+ */ + if (IsA(plan, Motion)) + show_dispatch_info(es->currentSlice, es, plan); + + es->indent++; + } + else + { + ExplainPropertyText("Node Type", sname, es); + if (nodeTag(plan) == T_Motion) + { + ExplainPropertyInteger("Senders", NULL, motion_snd, es); + ExplainPropertyInteger("Receivers", NULL, motion_recv, es); + } + if (strategy) + ExplainPropertyText("Strategy", strategy, es); + if (partialmode) + ExplainPropertyText("Partial Mode", partialmode, es); + if (operation) + ExplainPropertyText("Operation", operation, es); + if (relationship) + ExplainPropertyText("Parent Relationship", relationship, es); + if (plan_name) + ExplainPropertyText("Subplan Name", plan_name, es); + if (custom_name) + ExplainPropertyText("Custom Plan Provider", custom_name, es); + + show_dispatch_info(es->currentSlice, es, plan); + ExplainPropertyBool("Parallel Aware", plan->parallel_aware, es); + ExplainPropertyBool("Async Capable", plan->async_capable, es); + } + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_SubqueryScan: + case T_FunctionScan: + case T_TableFunctionScan: + case T_TableFuncScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + ExplainScanTarget((Scan *) plan, es); + break; + case T_ForeignScan: + case T_CustomScan: + if (((Scan *) plan)->scanrelid > 0) + ExplainScanTarget((Scan *) plan, es); + break; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + es); + ExplainScanTarget((Scan *) indexscan, es); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + es); + ExplainScanTarget((Scan *) indexonlyscan, es); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = 
+ explain_get_index_name(bitmapindexscan->indexid); + + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " on %s", + quote_identifier(indexname)); + else + ExplainPropertyText("Index Name", indexname, es); + } + break; + case T_ModifyTable: + ExplainModifyTarget((ModifyTable *) plan, es); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; + case JOIN_LASJ_NOTIN: + jointype = "Left Anti Semi (Not-In)"; + break; + default: + jointype = "???"; + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* + * For historical reasons, the join type is interpolated + * into the node type name... + */ + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(es->str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(es->str, " Join"); + } + else + ExplainPropertyText("Join Type", jointype, es); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " %s", setopcmd); + else + ExplainPropertyText("Command", setopcmd, es); + } + break; + case T_ShareInputScan: + { + ShareInputScan *sisc = (ShareInputScan *) plan; + int slice_id = -1; + + if (es->currentSlice) + slice_id = es->currentSlice->sliceIndex; + + if (es->format == EXPLAIN_FORMAT_TEXT) + 
appendStringInfo(es->str, " (share slice:id %d:%d)", + slice_id, sisc->share_id); + else + { + ExplainPropertyInteger("Share ID", NULL, sisc->share_id, es); + ExplainPropertyInteger("Slice ID", NULL, slice_id, es); + } + } + break; + case T_PartitionSelector: + { + PartitionSelector *ps = (PartitionSelector *) plan; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, " (selector id: $%d)", ps->paramid); + } + else + { + ExplainPropertyInteger("Selector ID", NULL, ps->paramid, es); + } + } + break; + default: + break; + } + + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, " (cost=%.2f..%.2f rows=%.0f width=%d)", + plan->startup_cost, plan->total_cost, + plan->plan_rows, plan->plan_width); + } + else + { + ExplainPropertyFloat("Startup Cost", NULL, plan->startup_cost, + 2, es); + ExplainPropertyFloat("Total Cost", NULL, plan->total_cost, + 2, es); + ExplainPropertyFloat("Plan Rows", NULL, plan->plan_rows, + 0, es); + ExplainPropertyInteger("Plan Width", NULL, plan->plan_width, + es); + } + } + + if (ResManagerPrintOperatorMemoryLimits()) + { + ExplainPropertyInteger("operatorMem", "kB", PlanStateOperatorMemKB(planstate), es); + } + /* + * We have to forcibly clean up the instrumentation state because we + * haven't done ExecutorEnd yet. This is pretty grotty ... + * + * Note: contrib/auto_explain could cause instrumentation to be set up + * even though we didn't ask for it here. Be careful not to print any + * instrumentation results the user didn't ask for. But we do the + * InstrEndLoop call anyway, if possible, to reduce the number of cases + * auto_explain has to contend with. + */ + if (planstate->instrument && !es->runtime) + InstrEndLoop(planstate->instrument); + + /* GPDB_90_MERGE_FIXME: In GPDB, these are printed differently. 
But does that work + * with the new XML/YAML EXPLAIN output */ + if (es->analyze && + planstate->instrument && planstate->instrument->nloops > 0) + { + double nloops = planstate->instrument->nloops; + double startup_ms = 1000.0 * planstate->instrument->startup / nloops; + double total_ms = 1000.0 * planstate->instrument->total / nloops; + double rows = planstate->instrument->ntuples / nloops; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (es->timing) + appendStringInfo(es->str, + " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", + startup_ms, total_ms, rows, nloops); + else + appendStringInfo(es->str, + " (actual rows=%.0f loops=%.0f)", + rows, nloops); + } + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", "ms", startup_ms, + 3, es); + ExplainPropertyFloat("Actual Total Time", "ms", total_ms, + 3, es); + } + ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es); + ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es); + } + } + else if (es->analyze && !es->runtime) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoString(es->str, " (never executed)"); + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", "ms", 0.0, 3, es); + ExplainPropertyFloat("Actual Total Time", "ms", 0.0, 3, es); + } + ExplainPropertyFloat("Actual Rows", NULL, 0.0, 0, es); + ExplainPropertyFloat("Actual Loops", NULL, 0.0, 0, es); + } + } +/* + * Print the progress of node execution at current loop. 
+ */ + if (planstate->instrument && es->analyze && es->runtime) + { + instr_time starttimespan; + double startup_sec; + double total_sec; + double rows; + double loop_num; + char *status; + + if (!INSTR_TIME_IS_ZERO(planstate->instrument->rt_starttime)) + { + INSTR_TIME_SET_CURRENT(starttimespan); + INSTR_TIME_SUBTRACT(starttimespan, planstate->instrument->rt_starttime); + } + else + INSTR_TIME_SET_ZERO(starttimespan); + startup_sec = 1000.0 * planstate->instrument->rt_firsttuple; + total_sec = 1000.0 * (INSTR_TIME_GET_DOUBLE(planstate->instrument->rt_counter) + + INSTR_TIME_GET_DOUBLE(starttimespan)); + rows = planstate->instrument->rt_tuplecount; + loop_num = planstate->instrument->nloops + 1; + + switch (planstate->instrument->nodeStatus) + { + case METRICS_PLAN_NODE_INITIALIZE: + status = &("Initialize"[0]); + break; + case METRICS_PLAN_NODE_EXECUTING: + status = &("Executing"[0]); + break; + case METRICS_PLAN_NODE_FINISHED: + status = &("Finished"[0]); + break; + default: + status = &("Unknown"[0]); + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, + " (node status: %s)", status); + } + else + { + ExplainPropertyText("Node status", status, es); + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (es->timing) + { + if (planstate->instrument->running) + appendStringInfo(es->str, + " (actual time=%.3f..%.3f rows=%.0f, loops=%.0f)", + startup_sec, total_sec, rows, loop_num); + else + appendStringInfo(es->str, + " (actual time=%.3f rows=0, loops=%.0f)", + total_sec, loop_num); + } + else + appendStringInfo(es->str, + " (actual rows=%.0f, loops=%.0f)", + rows, loop_num); + } + else + { + if (es->timing) + { + if (planstate->instrument->running) + { + ExplainPropertyFloat("Actual Startup Time", NULL, startup_sec, 3, es); + ExplainPropertyFloat("Actual Total Time", NULL, total_sec, 3, es); + } + else + ExplainPropertyFloat("Running Time", NULL, total_sec, 3, es); + } + ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es); + 
ExplainPropertyFloat("Actual Loops", NULL, loop_num, 0, es); + } + } + + /* in text format, first line ends here */ + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + + /* prepare per-worker general execution details */ + if (es->workers_state && es->verbose) + { + WorkerInstrumentation *w = planstate->worker_instrument; + + for (int n = 0; n < w->num_workers; n++) + { + Instrumentation *instrument = &w->instrument[n]; + double nloops = instrument->nloops; + double startup_ms; + double total_ms; + double rows; + + if (nloops <= 0) + continue; + startup_ms = 1000.0 * instrument->startup / nloops; + total_ms = 1000.0 * instrument->total / nloops; + rows = instrument->ntuples / nloops; + + ExplainOpenWorker(n, es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + if (es->timing) + appendStringInfo(es->str, + "actual time=%.3f..%.3f rows=%.0f loops=%.0f\n", + startup_ms, total_ms, rows, nloops); + else + appendStringInfo(es->str, + "actual rows=%.0f loops=%.0f\n", + rows, nloops); + } + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", "ms", + startup_ms, 3, es); + ExplainPropertyFloat("Actual Total Time", "ms", + total_ms, 3, es); + } + ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es); + ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es); + } + + ExplainCloseWorker(n, es); + } + } + + /* target list */ + if (es->verbose) + show_plan_tlist(planstate, ancestors, es); + + /* unique join */ + switch (nodeTag(plan)) + { + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + /* try not to be too chatty about this in text mode */ + if (es->format != EXPLAIN_FORMAT_TEXT || + (es->verbose && ((Join *) plan)->inner_unique)) + ExplainPropertyBool("Inner Unique", + ((Join *) plan)->inner_unique, + es); + break; + default: + break; + } + + /* quals, sort keys, etc */ + switch (nodeTag(plan)) + { + case T_IndexScan: + show_scan_qual(((IndexScan *) plan)->indexqualorig, + "Index 
Cond", planstate, ancestors, es); + if (((IndexScan *) plan)->indexqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(((IndexScan *) plan)->indexorderbyorig, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_IndexOnlyScan: + show_scan_qual(((IndexOnlyScan *) plan)->indexqual, + "Index Cond", planstate, ancestors, es); + if (((IndexOnlyScan *) plan)->recheckqual) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(((IndexOnlyScan *) plan)->indexorderby, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + if (es->analyze) + ExplainPropertyFloat("Heap Fetches", NULL, + planstate->instrument->ntuples2, 0, es); + break; + case T_BitmapIndexScan: + show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, + "Index Cond", planstate, ancestors, es); + break; + case T_BitmapHeapScan: + { + List *bitmapqualorig; + + bitmapqualorig = ((BitmapHeapScan *) plan)->bitmapqualorig; + + show_scan_qual(bitmapqualorig, + "Recheck Cond", planstate, ancestors, es); + + if (bitmapqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + if (es->analyze) + show_tidbitmap_info((BitmapHeapScanState *) planstate, es); + break; + } + case T_SampleScan: + show_tablesample(((SampleScan *) plan)->tablesample, + planstate, ancestors, es); + /* fall through to print additional fields the same as SeqScan */ + /* FALLTHROUGH */ + case T_SeqScan: + case T_ValuesScan: + case T_CteScan: + 
case T_NamedTuplestoreScan: + case T_WorkTableScan: + case T_SubqueryScan: + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_Gather: + { + Gather *gather = (Gather *) plan; + + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + ExplainPropertyInteger("Workers Planned", NULL, + gather->num_workers, es); + + /* Show params evaluated at gather node */ + if (gather->initParam) + show_eval_params(gather->initParam, es); + + if (es->analyze) + { + int nworkers; + + nworkers = ((GatherState *) planstate)->nworkers_launched; + ExplainPropertyInteger("Workers Launched", NULL, + nworkers, es); + } + + if (gather->single_copy || es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyBool("Single Copy", gather->single_copy, es); + } + break; + case T_GatherMerge: + { + GatherMerge *gm = (GatherMerge *) plan; + + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + ExplainPropertyInteger("Workers Planned", NULL, + gm->num_workers, es); + + /* Show params evaluated at gather-merge node */ + if (gm->initParam) + show_eval_params(gm->initParam, es); + + if (es->analyze) + { + int nworkers; + + nworkers = ((GatherMergeState *) planstate)->nworkers_launched; + ExplainPropertyInteger("Workers Launched", NULL, + nworkers, es); + } + } + break; + case T_FunctionScan: + if (es->verbose) + { + List *fexprs = NIL; + ListCell *lc; + + foreach(lc, ((FunctionScan *) plan)->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + fexprs = lappend(fexprs, rtfunc->funcexpr); + } + /* We rely on show_expression to insert commas as needed */ + show_expression((Node *) fexprs, + "Function Call", planstate, ancestors, + es->verbose, es); + 
} + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_TableFuncScan: + if (es->verbose) + { + TableFunc *tablefunc = ((TableFuncScan *) plan)->tablefunc; + + show_expression((Node *) tablefunc, + "Table Function Call", planstate, ancestors, + es->verbose, es); + } + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_TidScan: + { + /* + * The tidquals list has OR semantics, so be sure to show it + * as an OR condition. + */ + List *tidquals = ((TidScan *) plan)->tidquals; + + if (list_length(tidquals) > 1) + tidquals = list_make1(make_orclause(tidquals)); + show_scan_qual(tidquals, "TID Cond", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + } + break; + case T_TidRangeScan: + { + /* + * The tidrangequals list has AND semantics, so be sure to + * show it as an AND condition. 
+ */ + List *tidquals = ((TidRangeScan *) plan)->tidrangequals; + + if (list_length(tidquals) > 1) + tidquals = list_make1(make_andclause(tidquals)); + show_scan_qual(tidquals, "TID Cond", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + } + break; + case T_ForeignScan: + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + show_foreignscan_info((ForeignScanState *) planstate, es); + break; + case T_CustomScan: + { + CustomScanState *css = (CustomScanState *) planstate; + + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + if (css->methods->ExplainCustomScan) + css->methods->ExplainCustomScan(css, ancestors, es); + } + break; + case T_NestLoop: + show_upper_qual(((NestLoop *) plan)->join.joinqual, + "Join Filter", planstate, ancestors, es); + if (((NestLoop *) plan)->join.joinqual) + show_instrumentation_count("Rows Removed by Join Filter", 1, + planstate, es); + show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 2, + planstate, es); + break; + case T_MergeJoin: + show_upper_qual(((MergeJoin *) plan)->mergeclauses, + "Merge Cond", planstate, ancestors, es); + show_upper_qual(((MergeJoin *) plan)->join.joinqual, + "Join Filter", planstate, ancestors, es); + if (((MergeJoin *) plan)->join.joinqual) + show_instrumentation_count("Rows Removed by Join Filter", 1, + planstate, es); + show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 2, + planstate, es); + break; + case T_HashJoin: + { + HashJoin *hash_join = (HashJoin *) plan; + /* + * In the case 
of an "IS NOT DISTINCT" condition, we display + * hashqualclauses instead of hashclauses. + */ + List *cond_to_show = hash_join->hashclauses; + if (list_length(hash_join->hashqualclauses) > 0) + cond_to_show = hash_join->hashqualclauses; + + show_upper_qual(cond_to_show, + "Hash Cond", planstate, ancestors, es); + show_upper_qual(((HashJoin *) plan)->join.joinqual, + "Join Filter", planstate, ancestors, es); + if (((HashJoin *) plan)->join.joinqual) + show_instrumentation_count("Rows Removed by Join Filter", 1, + planstate, es); + show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 2, + planstate, es); + break; + } + case T_TupleSplit: + show_tuple_split_keys((TupleSplitState *)planstate, ancestors, es); + break; + case T_Agg: + show_agg_keys(castNode(AggState, planstate), ancestors, es); + show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + show_hashagg_info((AggState *) planstate, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; +#if 0 /* Group node has been disabled in GPDB */ + case T_Group: + show_group_keys(castNode(GroupState, planstate), ancestors, es); + show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; +#endif + case T_WindowAgg: + show_windowagg_keys((WindowAggState *) planstate, ancestors, es); + break; + case T_TableFunctionScan: + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + /* TODO: Partitioning and ordering information */ + break; + case T_Unique: + show_motion_keys(planstate, + NIL, + ((Unique *) plan)->numCols, + ((Unique *) plan)->uniqColIdx, + "Group Key", + ancestors, es); + break; + case T_Sort: + show_sort_keys(castNode(SortState, planstate), ancestors, es); + show_sort_info(castNode(SortState, planstate), es); + break; + case T_IncrementalSort: + 
show_incremental_sort_keys(castNode(IncrementalSortState, planstate), + ancestors, es); + show_incremental_sort_info(castNode(IncrementalSortState, planstate), + es); + break; + case T_MergeAppend: + show_merge_append_keys(castNode(MergeAppendState, planstate), + ancestors, es); + break; + case T_Result: + show_upper_qual((List *) ((Result *) plan)->resconstantqual, + "One-Time Filter", planstate, ancestors, es); + show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_ModifyTable: + show_modifytable_info(castNode(ModifyTableState, planstate), ancestors, + es); + break; + case T_Hash: + show_hash_info(castNode(HashState, planstate), es); + break; + case T_RuntimeFilter: + show_runtime_filter_info(castNode(RuntimeFilterState, planstate), + es); + break; + case T_Motion: + { + Motion *pMotion = (Motion *) plan; + + if (pMotion->sendSorted || pMotion->motionType == MOTIONTYPE_HASH) + show_motion_keys(planstate, + pMotion->hashExprs, + pMotion->numSortCols, + pMotion->sortColIdx, + "Merge Key", + ancestors, es); + if (pMotion->motionType == MOTIONTYPE_HASH && + pMotion->numHashSegments != motion_recv) + { + Assert(pMotion->numHashSegments < motion_recv); + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, + "Hash Module: %d\n", + pMotion->numHashSegments); + } + } + break; + case T_AssertOp: + show_upper_qual(plan->qual, "Assert Cond", planstate, ancestors, es); + break; + case T_Append: + show_join_pruning_info(((Append *) plan)->join_prune_paramids, es); + break; + case T_Memoize: + show_memoize_info(castNode(MemoizeState, planstate), ancestors, + es); + break; + default: + break; + } + + /* Show executor statistics */ + if (planstate->instrument && planstate->instrument->need_cdb && !es->runtime) + cdbexplain_showExecStats(planstate, es); + + /* + * Prepare per-worker JIT instrumentation. 
As with the overall JIT + * summary, this is printed only if printing costs is enabled. + */ + if (es->workers_state && es->costs && es->verbose) + { + SharedJitInstrumentation *w = planstate->worker_jit_instrument; + + if (w) + { + for (int n = 0; n < w->num_workers; n++) + { + ExplainOpenWorker(n, es); + ExplainPrintJIT(es, planstate->state->es_jit_flags, + &w->jit_instr[n]); + ExplainCloseWorker(n, es); + } + } + } + + /* Show buffer/WAL usage */ + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &planstate->instrument->bufusage, false); + if (es->wal && planstate->instrument) + show_wal_usage(es, &planstate->instrument->walusage); + + /* Prepare per-worker buffer/WAL usage */ + if (es->workers_state && (es->buffers || es->wal) && es->verbose && !es->runtime) + { + WorkerInstrumentation *w = planstate->worker_instrument; + + for (int n = 0; n < w->num_workers; n++) + { + Instrumentation *instrument = &w->instrument[n]; + double nloops = instrument->nloops; + + if (nloops <= 0) + continue; + + ExplainOpenWorker(n, es); + if (es->buffers) + show_buffer_usage(es, &instrument->bufusage, false); + if (es->wal) + show_wal_usage(es, &instrument->walusage); + ExplainCloseWorker(n, es); + } + } + + /* Show per-worker details for this plan node, then pop that stack */ + if (es->workers_state) + ExplainFlushWorkersState(es); + es->workers_state = save_workers_state; + + /* + * If partition pruning was done during executor initialization, the + * number of child plans we'll display below will be less than the number + * of subplans that was specified in the plan. To make this a bit less + * mysterious, emit an indication that this happened. Note that this + * field is emitted now because we want it to be a property of the parent + * node; it *cannot* be emitted within the Plans sub-node we'll open next. 
+ */ + switch (nodeTag(plan)) + { + case T_Append: + ExplainMissingMembers(((AppendState *) planstate)->as_nplans, + list_length(((Append *) plan)->appendplans), + es); + break; + case T_MergeAppend: + ExplainMissingMembers(((MergeAppendState *) planstate)->ms_nplans, + list_length(((MergeAppend *) plan)->mergeplans), + es); + break; + default: + break; + } + + /* Get ready to display the child plans */ + haschildren = planstate->initPlan || + outerPlanState(planstate) || + innerPlanState(planstate) || + IsA(plan, Append) || + IsA(plan, MergeAppend) || + IsA(plan, Sequence) || + IsA(plan, BitmapAnd) || + IsA(plan, BitmapOr) || + IsA(plan, SubqueryScan) || + (IsA(planstate, CustomScanState) && + ((CustomScanState *) planstate)->custom_ps != NIL) || + planstate->subPlan; + if (haschildren) + { + ExplainOpenGroup("Plans", "Plans", false, es); + /* Pass current Plan as head of ancestors list for children */ + ancestors = lcons(plan, ancestors); + } + + /* initPlan-s */ + if (plan->initPlan) + VecExplainSubPlans(planstate->initPlan, ancestors, "InitPlan", es, planstate->state->es_sliceTable); + + /* lefttree */ + if (outerPlan(plan) && !skip_outer) + { + VecExplainNode(outerPlanState(planstate), ancestors, + "Outer", NULL, es); + } + else if (skip_outer) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, " -> "); + appendStringInfoString(es->str, skip_outer_msg); + appendStringInfo(es->str, "\n"); + } + + /* righttree */ + if (innerPlanState(planstate)) + VecExplainNode(innerPlanState(planstate), ancestors, + "Inner", NULL, es); + + /* special child plans */ + switch (nodeTag(plan)) + { + case T_Append: + ExplainMemberNodes(((AppendState *) planstate)->appendplans, + ((AppendState *) planstate)->as_nplans, + ancestors, es); + break; + case T_MergeAppend: + ExplainMemberNodes(((MergeAppendState *) planstate)->mergeplans, + ((MergeAppendState *) planstate)->ms_nplans, + ancestors, es); + break; + case T_Sequence: + 
ExplainMemberNodes(((SequenceState *) planstate)->subplans, + ((SequenceState *) planstate)->numSubplans, + ancestors, es); + break; + case T_BitmapAnd: + ExplainMemberNodes(((BitmapAndState *) planstate)->bitmapplans, + ((BitmapAndState *) planstate)->nplans, + ancestors, es); + break; + case T_BitmapOr: + ExplainMemberNodes(((BitmapOrState *) planstate)->bitmapplans, + ((BitmapOrState *) planstate)->nplans, + ancestors, es); + break; + case T_SubqueryScan: + VecExplainNode(((SubqueryScanState *) planstate)->subplan, ancestors, + "Subquery", NULL, es); + break; + case T_CustomScan: + ExplainCustomChildren((CustomScanState *) planstate, + ancestors, es); + break; + default: + break; + } + + /* subPlan-s */ + if (planstate->subPlan) + VecExplainSubPlans(planstate->subPlan, ancestors, "SubPlan", es, NULL); + + /* end of child plans */ + if (haschildren) + { + ancestors = list_delete_first(ancestors); + ExplainCloseGroup("Plans", "Plans", false, es); + } + + /* in text format, undo whatever indentation we added */ + if (es->format == EXPLAIN_FORMAT_TEXT) + es->indent = save_indent; + + ExplainCloseGroup("Plan", + relationship ? NULL : "Plan", + true, es); + + es->currentSlice = save_currentSlice; +} + + +/* + * Explain a list of SubPlans (or initPlans, which also use SubPlan nodes). + * + * The ancestors list should already contain the immediate parent of these + * SubPlans. 
 */
static void
VecExplainSubPlans(List *plans, List *ancestors,
				   const char *relationship, ExplainState *es,
				   SliceTable *sliceTable)
{
	ListCell   *lst;
	ExecSlice  *saved_slice = es->currentSlice;

	foreach(lst, plans)
	{
		SubPlanState *sps = (SubPlanState *) lfirst(lst);
		SubPlan    *sp = sps->subplan;
		int			qDispSliceId;

		/*
		 * Look up which dispatch slice (if any) this subplan's plan tree
		 * belongs to; -1 means "no per-subplan slice info available".
		 */
		if (es->pstmt->subplan_sliceIds)
			qDispSliceId = es->pstmt->subplan_sliceIds[sp->plan_id - 1];
		else
			qDispSliceId = -1;

		/*
		 * There can be multiple SubPlan nodes referencing the same physical
		 * subplan (same plan_id, which is its index in PlannedStmt.subplans).
		 * We should print a subplan only once, so track which ones we already
		 * printed.  This state must be global across the plan tree, since the
		 * duplicate nodes could be in different plan nodes, eg both a bitmap
		 * indexscan's indexqual and its parent heapscan's recheck qual.  (We
		 * do not worry too much about which plan node we show the subplan as
		 * attached to in such cases.)
		 */
		if (bms_is_member(sp->plan_id, es->printed_subplans))
			continue;
		es->printed_subplans = bms_add_member(es->printed_subplans,
											  sp->plan_id);

		/* Subplan might have its own root slice */
		if (sliceTable && qDispSliceId > 0)
		{
			es->currentSlice = &sliceTable->slices[qDispSliceId];
			es->subplanDispatchedSeparately = true;
		}
		else
			es->subplanDispatchedSeparately = false;

		if (sps->planstate == NULL)
		{
			/* No planstate was built for this subplan: report it as unused. */
			appendStringInfoSpaces(es->str, es->indent * 2);
			appendStringInfo(es->str, " -> ");
			appendStringInfo(es->str, "UNUSED %s", sp->plan_name);
			appendStringInfo(es->str, "\n");
		}
		else
		{
			/*
			 * Treat the SubPlan node as an ancestor of the plan node(s) within
			 * it, so that ruleutils.c can find the referents of subplan
			 * parameters.
			 */
			ancestors = lcons(sp, ancestors);

			VecExplainNode(sps->planstate, ancestors,
						   relationship, sp->plan_name, es);

			ancestors = list_delete_first(ancestors);
		}
	}

	/* Undo any slice switches made above */
	es->currentSlice = saved_slice;
}

/*
 * VecExplainPrintSettings -
 *    Print summary of modified settings affecting query planning.
 *
 * In non-text formats this always emits a "Settings" group (plus the
 * optimizer name); in text format nothing at all is printed when no
 * relevant settings were modified (num <= 0).
 */
static void
VecExplainPrintSettings(ExplainState *es, PlanGenerator planGen)
{
	int			num;
	struct config_generic **gucs;

	/* request an array of relevant settings */
	gucs = get_explain_guc_options(&num, es->verbose, es->settings);

	if (es->format != EXPLAIN_FORMAT_TEXT)
	{
		VecExplainOpenGroup("Settings", "Settings", true, es);

		if (planGen == PLANGEN_PLANNER)
			ExplainPropertyStringInfo("Optimizer", es, "Postgres query optimizer");
#ifdef USE_ORCA
		else
			ExplainPropertyStringInfo("Optimizer", es, "Pivotal Optimizer (GPORCA)");
#endif

		for (int i = 0; i < num; i++)
		{
			char	   *setting;
			struct config_generic *conf = gucs[i];

			setting = GetConfigOptionByName(conf->name, NULL, true);

			ExplainPropertyText(conf->name, setting, es);
		}

		VecExplainCloseGroup("Settings", "Settings", true, es);
	}
	else
	{
		StringInfoData str;

		/* text format: print nothing if no settings were modified */
		if (num <= 0)
			return;

		initStringInfo(&str);

		/* build a single comma-separated "name = 'value'" list */
		for (int i = 0; i < num; i++)
		{
			char	   *setting;
			struct config_generic *conf = gucs[i];

			if (i > 0)
				appendStringInfoString(&str, ", ");

			setting = GetConfigOptionByName(conf->name, NULL, true);

			if (setting)
				appendStringInfo(&str, "%s = '%s'", conf->name, setting);
			else
				appendStringInfo(&str, "%s = NULL", conf->name);
		}

		ExplainPropertyText("Settings", str.data, es);
	}
}

/*
 * Print per-worker info for current node, then free the ExplainWorkersState.
+ */ +static void +ExplainFlushWorkersState(ExplainState *es) +{ + ExplainWorkersState *wstate = es->workers_state; + + VecExplainOpenGroup("Workers", "Workers", false, es); + for (int i = 0; i < wstate->num_workers; i++) + { + if (wstate->worker_inited[i]) + { + /* This must match previous ExplainOpenSetAsideGroup call */ + VecExplainOpenGroup("Worker", NULL, true, es); + appendStringInfoString(es->str, wstate->worker_str[i].data); + VecExplainCloseGroup("Worker", NULL, true, es); + + pfree(wstate->worker_str[i].data); + } + } + VecExplainCloseGroup("Workers", "Workers", false, es); + + pfree(wstate->worker_inited); + pfree(wstate->worker_str); + pfree(wstate->worker_state_save); + pfree(wstate); +} + +/* + * Explain the constituent plans of an Append, MergeAppend, + * BitmapAnd, or BitmapOr node. + * + * The ancestors list should already contain the immediate parent of these + * plans. + */ +static void +ExplainMemberNodes(PlanState **planstates, int nplans, + List *ancestors, ExplainState *es) +{ + int j; + + for (j = 0; j < nplans; j++) + VecExplainNode(planstates[j], ancestors, + "Member", NULL, es); +} + +/* + * Report about any pruned subnodes of an Append or MergeAppend node. + * + * nplans indicates the number of live subplans. + * nchildren indicates the original number of subnodes in the Plan; + * some of these may have been pruned by the run-time pruning code. + */ +static void +ExplainMissingMembers(int nplans, int nchildren, ExplainState *es) +{ + if (nplans < nchildren || es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Subplans Removed", NULL, + nchildren - nplans, es); +} + +/* + * ExplainPrintJIT - + * Append information about JITing to es->str. 
 */
static void
ExplainPrintJIT(ExplainState *es, int jit_flags, JitInstrumentation *ji)
{
	instr_time	total_time;

	/* don't print information if no JITing happened */
	if (!ji || ji->created_functions == 0)
		return;

	/* calculate total time: generation + inlining + optimization + emission */
	INSTR_TIME_SET_ZERO(total_time);
	INSTR_TIME_ADD(total_time, ji->generation_counter);
	INSTR_TIME_ADD(total_time, ji->inlining_counter);
	INSTR_TIME_ADD(total_time, ji->optimization_counter);
	INSTR_TIME_ADD(total_time, ji->emission_counter);

	VecExplainOpenGroup("JIT", "JIT", true, es);

	/* for higher density, open code the text output format */
	if (es->format == EXPLAIN_FORMAT_TEXT)
	{
		ExplainIndentText(es);
		appendStringInfoString(es->str, "JIT:\n");
		es->indent++;

		ExplainPropertyInteger("Functions", NULL, ji->created_functions, es);

		ExplainIndentText(es);
		appendStringInfo(es->str, "Options: %s %s, %s %s, %s %s, %s %s\n",
						 "Inlining", jit_flags & PGJIT_INLINE ? "true" : "false",
						 "Optimization", jit_flags & PGJIT_OPT3 ? "true" : "false",
						 "Expressions", jit_flags & PGJIT_EXPR ? "true" : "false",
						 "Deforming", jit_flags & PGJIT_DEFORM ? "true" : "false");

		/* timings are only meaningful under EXPLAIN ANALYZE with TIMING on */
		if (es->analyze && es->timing)
		{
			ExplainIndentText(es);
			appendStringInfo(es->str,
							 "Timing: %s %.3f ms, %s %.3f ms, %s %.3f ms, %s %.3f ms, %s %.3f ms\n",
							 "Generation", 1000.0 * INSTR_TIME_GET_DOUBLE(ji->generation_counter),
							 "Inlining", 1000.0 * INSTR_TIME_GET_DOUBLE(ji->inlining_counter),
							 "Optimization", 1000.0 * INSTR_TIME_GET_DOUBLE(ji->optimization_counter),
							 "Emission", 1000.0 * INSTR_TIME_GET_DOUBLE(ji->emission_counter),
							 "Total", 1000.0 * INSTR_TIME_GET_DOUBLE(total_time));
		}

		es->indent--;
	}
	else
	{
		/* structured formats: emit the same data as labeled properties */
		ExplainPropertyInteger("Functions", NULL, ji->created_functions, es);

		VecExplainOpenGroup("Options", "Options", true, es);
		ExplainPropertyBool("Inlining", jit_flags & PGJIT_INLINE, es);
		ExplainPropertyBool("Optimization", jit_flags & PGJIT_OPT3, es);
		ExplainPropertyBool("Expressions", jit_flags & PGJIT_EXPR, es);
		ExplainPropertyBool("Deforming", jit_flags & PGJIT_DEFORM, es);
		VecExplainCloseGroup("Options", "Options", true, es);

		if (es->analyze && es->timing)
		{
			VecExplainOpenGroup("Timing", "Timing", true, es);

			ExplainPropertyFloat("Generation", "ms",
								 1000.0 * INSTR_TIME_GET_DOUBLE(ji->generation_counter),
								 3, es);
			ExplainPropertyFloat("Inlining", "ms",
								 1000.0 * INSTR_TIME_GET_DOUBLE(ji->inlining_counter),
								 3, es);
			ExplainPropertyFloat("Optimization", "ms",
								 1000.0 * INSTR_TIME_GET_DOUBLE(ji->optimization_counter),
								 3, es);
			ExplainPropertyFloat("Emission", "ms",
								 1000.0 * INSTR_TIME_GET_DOUBLE(ji->emission_counter),
								 3, es);
			ExplainPropertyFloat("Total", "ms",
								 1000.0 * INSTR_TIME_GET_DOUBLE(total_time),
								 3, es);

			VecExplainCloseGroup("Timing", "Timing", true, es);
		}
	}

	VecExplainCloseGroup("JIT", "JIT", true, es);
}

/*
 * Begin or resume output into the set-aside group for worker N.
 */
static void
ExplainOpenWorker(int n, ExplainState *es)
{
	ExplainWorkersState *wstate = es->workers_state;

	Assert(wstate);
	Assert(n >= 0 && n < wstate->num_workers);

	/* Save prior output buffer pointer (restored by ExplainCloseWorker) */
	wstate->prev_str = es->str;

	if (!wstate->worker_inited[n])
	{
		/* First time through, so create the buffer for this worker */
		initStringInfo(&wstate->worker_str[n]);
		es->str = &wstate->worker_str[n];

		/*
		 * Push suitable initial formatting state for this worker's field
		 * group.  We allow one extra logical nesting level, since this group
		 * will eventually be wrapped in an outer "Workers" group.
		 */
		ExplainOpenSetAsideGroup("Worker", NULL, true, 2, es);

		/*
		 * In non-TEXT formats we always emit a "Worker Number" field, even if
		 * there's no other data for this worker.
		 */
		if (es->format != EXPLAIN_FORMAT_TEXT)
			ExplainPropertyInteger("Worker Number", NULL, n, es);

		wstate->worker_inited[n] = true;
	}
	else
	{
		/* Resuming output for a worker we've already emitted some data for */
		es->str = &wstate->worker_str[n];

		/* Restore formatting state saved by last ExplainCloseWorker() */
		ExplainRestoreGroup(es, 2, &wstate->worker_state_save[n]);
	}

	/*
	 * In TEXT format, prefix the first output line for this worker with
	 * "Worker N:".  Then, any additional lines should be indented one more
	 * stop than the "Worker N" line is.
	 */
	if (es->format == EXPLAIN_FORMAT_TEXT)
	{
		if (es->str->len == 0)
		{
			ExplainIndentText(es);
			appendStringInfo(es->str, "Worker %d: ", n);
		}

		es->indent++;
	}
}

/*
 * End output for worker N --- must pair with previous ExplainOpenWorker call
 */
static void
ExplainCloseWorker(int n, ExplainState *es)
{
	ExplainWorkersState *wstate = es->workers_state;

	Assert(wstate);
	Assert(n >= 0 && n < wstate->num_workers);
	Assert(wstate->worker_inited[n]);

	/*
	 * Save formatting state in case we do another ExplainOpenWorker(), then
	 * pop the formatting stack.
	 */
	ExplainSaveGroup(es, 2, &wstate->worker_state_save[n]);

	/*
	 * In TEXT format, if we didn't actually produce any output line(s) then
	 * truncate off the partial line emitted by ExplainOpenWorker.  (This is
	 * to avoid bogus output if, say, show_buffer_usage chooses not to print
	 * anything for the worker.)  Also fix up the indent level.
	 */
	if (es->format == EXPLAIN_FORMAT_TEXT)
	{
		/* peel characters back to (and excluding) the last newline */
		while (es->str->len > 0 && es->str->data[es->str->len - 1] != '\n')
			es->str->data[--(es->str->len)] = '\0';

		es->indent--;
	}

	/* Restore prior output buffer pointer */
	es->str = wstate->prev_str;
}

/*
 * Pop one level of grouping state, allowing for a re-push later.
 *
 * This is typically used after ExplainOpenSetAsideGroup; pass the
 * same "depth" used for that.
 *
 * This should not emit any output.  If state needs to be saved,
 * save it at *state_save.  Currently, an integer save area is sufficient
 * for all formats, but we might need to revisit that someday.
+ */ +static void +ExplainSaveGroup(ExplainState *es, int depth, int *state_save) +{ + switch (es->format) + { + case EXPLAIN_FORMAT_TEXT: + /* nothing to do */ + break; + + case EXPLAIN_FORMAT_XML: + es->indent -= depth; + break; + + case EXPLAIN_FORMAT_JSON: + es->indent -= depth; + *state_save = linitial_int(es->grouping_stack); + es->grouping_stack = list_delete_first(es->grouping_stack); + break; + + case EXPLAIN_FORMAT_YAML: + es->indent -= depth; + *state_save = linitial_int(es->grouping_stack); + es->grouping_stack = list_delete_first(es->grouping_stack); + break; + } +} + +/* + * Re-push one level of grouping state, undoing the effects of ExplainSaveGroup. + */ +static void +ExplainRestoreGroup(ExplainState *es, int depth, int *state_save) +{ + switch (es->format) + { + case EXPLAIN_FORMAT_TEXT: + /* nothing to do */ + break; + + case EXPLAIN_FORMAT_XML: + es->indent += depth; + break; + + case EXPLAIN_FORMAT_JSON: + es->grouping_stack = lcons_int(*state_save, es->grouping_stack); + es->indent += depth; + break; + + case EXPLAIN_FORMAT_YAML: + es->grouping_stack = lcons_int(*state_save, es->grouping_stack); + es->indent += depth; + break; + } +} + +/* + * Open a group of related objects, without emitting actual data. + * + * Prepare the formatting state as though we were beginning a group with + * the identified properties, but don't actually emit anything. Output + * subsequent to this call can be redirected into a separate output buffer, + * and then eventually appended to the main output buffer after doing a + * regular VecExplainOpenGroup call (with the same parameters). + * + * The extra "depth" parameter is the new group's depth compared to current. + * It could be more than one, in case the eventual output will be enclosed + * in additional nesting group levels. We assume we don't need to track + * formatting state for those levels while preparing this group's output. 
 *
 * There is no ExplainCloseSetAsideGroup --- in current usage, we always
 * pop this state with ExplainSaveGroup.
 */
static void
ExplainOpenSetAsideGroup(const char *objtype, const char *labelname,
						 bool labeled, int depth, ExplainState *es)
{
	/*
	 * Note: objtype and labeled are accepted for symmetry with
	 * VecExplainOpenGroup but are not consulted here; only labelname (YAML)
	 * and depth affect the prepared state.
	 */
	switch (es->format)
	{
		case EXPLAIN_FORMAT_TEXT:
			/* nothing to do */
			break;

		case EXPLAIN_FORMAT_XML:
			es->indent += depth;
			break;

		case EXPLAIN_FORMAT_JSON:
			es->grouping_stack = lcons_int(0, es->grouping_stack);
			es->indent += depth;
			break;

		case EXPLAIN_FORMAT_YAML:
			/* labeled YAML groups get a 1 flag, unlabeled a 0 */
			if (labelname)
				es->grouping_stack = lcons_int(1, es->grouping_stack);
			else
				es->grouping_stack = lcons_int(0, es->grouping_stack);
			es->indent += depth;
			break;
	}
}

/*
 * cdbexplain_showExecStats
 *	  Called by qDisp process to format a node's EXPLAIN ANALYZE statistics.
 *
 * 'planstate' is the node whose statistics are to be displayed.
 * 'es' carries the output buffer, indentation, and the
 *		CdbExplain_ShowStatCtx (es->showstatctx) created by a call to
 *		cdbexplain_showExecStatsBegin().
 */
static void
cdbexplain_showExecStats(struct PlanState *planstate, ExplainState *es)
{
	struct CdbExplain_ShowStatCtx *ctx = es->showstatctx;
	Instrumentation *instr = planstate->instrument;
	/* runtime EXPLAIN reads the live summary, otherwise the final one */
	CdbExplain_NodeSummary *ns = es->runtime ? instr->rt_cdbNodeSummary : instr->cdbNodeSummary;
	instr_time	timediff;
	int			i;

	char		totalbuf[50];
	char		avgbuf[50];
	char		maxbuf[50];
	char		segbuf[50];
	char		startbuf[50];

	/* Might not have received stats from qExecs if they hit errors. */
	if (!ns)
		return;

	Assert(instr != NULL);

	/*
	 * Executor memory used by this individual node, if it allocates from a
	 * memory context of its own instead of sharing the per-query context.
+ */ + if (es->analyze && ns->execmemused.vcnt > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "Executor Memory: %ldkB Segments: %d Max: %ldkB (segment %d)\n", + (long) kb(ns->execmemused.vsum), + ns->execmemused.vcnt, + (long) kb(ns->execmemused.vmax), + ns->execmemused.imax); + } + else + { + ExplainPropertyInteger("Executor Memory", "kB", kb(ns->execmemused.vsum), es); + ExplainPropertyInteger("Executor Memory Segments", NULL, ns->execmemused.vcnt, es); + ExplainPropertyInteger("Executor Max Memory", "kB", kb(ns->execmemused.vmax), es); + ExplainPropertyInteger("Executor Max Memory Segment", NULL, ns->execmemused.imax, es); + } + } + + /* + * Actual work_mem used and wanted + */ + if (es->analyze && es->verbose && ns->workmemused.vcnt > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "work_mem: %ldkB Segments: %d Max: %ldkB (segment %d)", + (long) kb(ns->workmemused.vsum), + ns->workmemused.vcnt, + (long) kb(ns->workmemused.vmax), + ns->workmemused.imax); + + /* + * Total number of segments in which this node reuses cached or + * creates workfiles. 
+ */ + if (nodeSupportWorkfileCaching(planstate)) + appendStringInfo(es->str, " Workfile: (%d spilling)", + ns->totalWorkfileCreated.vcnt); + + appendStringInfo(es->str, "\n"); + + if (ns->workmemwanted.vcnt > 0) + { + appendStringInfoSpaces(es->str, es->indent * 2); + cdbexplain_formatMemory(maxbuf, sizeof(maxbuf), ns->workmemwanted.vmax); + if (ns->ninst == 1) + { + appendStringInfo(es->str, + "Work_mem wanted: %s to lessen workfile I/O.", + maxbuf); + } + else + { + cdbexplain_formatMemory(avgbuf, sizeof(avgbuf), cdbexplain_agg_avg(&ns->workmemwanted)); + cdbexplain_formatSeg(segbuf, sizeof(segbuf), ns->workmemwanted.imax, ns->ninst); + appendStringInfo(es->str, + "Work_mem wanted: %s avg, %s max%s" + " to lessen workfile I/O affecting %d workers.", + avgbuf, maxbuf, segbuf, ns->workmemwanted.vcnt); + } + + appendStringInfo(es->str, "\n"); + } + } + else + { + VecExplainOpenGroup("work_mem", "work_mem", true, es); + ExplainPropertyInteger("Used", "kB", kb(ns->workmemused.vsum), es); + ExplainPropertyInteger("Segments", NULL, ns->workmemused.vcnt, es); + ExplainPropertyInteger("Max Memory", "kB", kb(ns->workmemused.vmax), es); + ExplainPropertyInteger("Max Memory Segment", NULL, ns->workmemused.imax, es); + + /* + * Total number of segments in which this node reuses cached or + * creates workfiles. 
+ */ + if (nodeSupportWorkfileCaching(planstate)) + ExplainPropertyInteger("Workfile Spilling", NULL, ns->totalWorkfileCreated.vcnt, es); + + if (ns->workmemwanted.vcnt > 0) + { + ExplainPropertyInteger("Max Memory Wanted", "kB", kb(ns->workmemwanted.vmax), es); + + if (ns->ninst > 1) + { + ExplainPropertyInteger("Max Memory Wanted Segment", NULL, ns->workmemwanted.imax, es); + ExplainPropertyInteger("Avg Memory Wanted", "kB", kb(cdbexplain_agg_avg(&ns->workmemwanted)), es); + ExplainPropertyInteger("Segments Affected", NULL, ns->ninst, es); + } + } + + VecExplainCloseGroup("work_mem", "work_mem", true, es); + } + } + + bool haveExtraText = false; + StringInfoData extraData; + + initStringInfo(&extraData); + + for (i = 0; i < ns->ninst; i++) + { + CdbExplain_StatInst *nsi = &ns->insts[i]; + + if (nsi->bnotes < nsi->enotes) + { + if (!haveExtraText) + { + VecExplainOpenGroup("Extra Text", "Extra Text", false, es); + VecExplainOpenGroup("Segment", NULL, true, es); + haveExtraText = true; + } + + resetStringInfo(&extraData); + + cdbexplain_formatExtraText(&extraData, + 0, + (ns->ninst == 1) ? -1 + : ns->segindex0 + i, + ctx->extratextbuf.data + nsi->bnotes, + nsi->enotes - nsi->bnotes); + ExplainPropertyStringInfo("Extra Text", es, "%s", extraData.data); + } + } + + if (haveExtraText) + { + VecExplainCloseGroup("Segment", NULL, true, es); + VecExplainCloseGroup("Extra Text", "Extra Text", false, es); + } + pfree(extraData.data); + + /* + * Dump stats for all workers. 
+ */ + if (gp_enable_explain_allstat && ns->segindex0 >= 0 && ns->ninst > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* + * create a header for all stats: separate each individual stat by an + * underscore, separate the grouped stats for each node by a slash + */ + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfoString(es->str, + "allstat: seg_firststart_total_ntuples"); + } + else + VecExplainOpenGroup("Allstat", "Allstat", true, es); + + for (i = 0; i < ns->ninst; i++) + { + CdbExplain_StatInst *nsi = &ns->insts[i]; + + if (INSTR_TIME_IS_ZERO(nsi->firststart)) + continue; + + /* Time from start of query on qDisp to worker's first result row */ + INSTR_TIME_SET_ZERO(timediff); + INSTR_TIME_ACCUM_DIFF(timediff, nsi->firststart, ctx->querystarttime); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + cdbexplain_formatSeconds(startbuf, sizeof(startbuf), + INSTR_TIME_GET_DOUBLE(timediff), true); + cdbexplain_formatSeconds(totalbuf, sizeof(totalbuf), + nsi->total, true); + appendStringInfo(es->str, + "/seg%d_%s_%s_%.0f", + ns->segindex0 + i, + startbuf, + totalbuf, + nsi->ntuples); + } + else + { + cdbexplain_formatSeconds(startbuf, sizeof(startbuf), + INSTR_TIME_GET_DOUBLE(timediff), false); + cdbexplain_formatSeconds(totalbuf, sizeof(totalbuf), + nsi->total, false); + + VecExplainOpenGroup("Segment", NULL, false, es); + ExplainPropertyInteger("Segment index", NULL, ns->segindex0 + i, es); + ExplainPropertyText("Time To First Result", startbuf, es); + ExplainPropertyText("Time To Total Result", totalbuf, es); + ExplainPropertyFloat("Tuples", NULL, nsi->ntuples, 1, es); + VecExplainCloseGroup("Segment", NULL, false, es); + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoString(es->str, "//end\n"); + else + VecExplainCloseGroup("Allstat", "Allstat", true, es); + } +} /* cdbexplain_showExecStats */ + +/* + * cdbexplain_formatMemory + * Convert memory size to string from (double) bytes. 
+ * + * outbuf: [output] pointer to a char buffer to be filled + * bufsize: [input] maximum number of characters to write to outbuf (must be set by the caller) + * bytes: [input] a value representing memory size in bytes to be written to outbuf + */ +static void +cdbexplain_formatMemory(char *outbuf, int bufsize, double bytes) +{ + Assert(outbuf != NULL && "CDBEXPLAIN: char buffer is null"); + Assert(bufsize > 0 && "CDBEXPLAIN: size of char buffer is zero"); + /* check if truncation occurs */ +#ifdef USE_ASSERT_CHECKING + int nchars_written = +#endif /* USE_ASSERT_CHECKING */ + snprintf(outbuf, bufsize, "%.0fK bytes", kb(bytes)); + + Assert(nchars_written < bufsize && + "CDBEXPLAIN: size of char buffer is smaller than the required number of chars"); +} /* cdbexplain_formatMemory */ + +/* + * cdbexplain_formatSeg + * Convert segment id to string. + * + * outbuf: [output] pointer to a char buffer to be filled + * bufsize: [input] maximum number of characters to write to outbuf (must be set by the caller) + * segindex:[input] a value representing segment index to be written to outbuf + * nInst: [input] no. 
of stat instances + */ +static void +cdbexplain_formatSeg(char *outbuf, int bufsize, int segindex, int nInst) +{ + Assert(outbuf != NULL && "CDBEXPLAIN: char buffer is null"); + Assert(bufsize > 0 && "CDBEXPLAIN: size of char buffer is zero"); + + if (nInst > 1 && segindex >= 0) + { + /* check if truncation occurs */ +#ifdef USE_ASSERT_CHECKING + int nchars_written = +#endif /* USE_ASSERT_CHECKING */ + snprintf(outbuf, bufsize, " (seg%d)", segindex); + + Assert(nchars_written < bufsize && + "CDBEXPLAIN: size of char buffer is smaller than the required number of chars"); + } + else + { + outbuf[0] = '\0'; + } +} /* cdbexplain_formatSeg */ + + +/* + * cdbexplain_formatSeconds + * Convert time in seconds to readable string + * + * outbuf: [output] pointer to a char buffer to be filled + * bufsize: [input] maximum number of characters to write to outbuf (must be set by the caller) + * seconds: [input] a value representing no. of seconds to be written to outbuf + */ +static void +cdbexplain_formatSeconds(char *outbuf, int bufsize, double seconds, bool unit) +{ + Assert(outbuf != NULL && "CDBEXPLAIN: char buffer is null"); + Assert(bufsize > 0 && "CDBEXPLAIN: size of char buffer is zero"); + double ms = seconds * 1000.0; + + /* check if truncation occurs */ +#ifdef USE_ASSERT_CHECKING + int nchars_written = +#endif /* USE_ASSERT_CHECKING */ + snprintf(outbuf, bufsize, "%.*f%s", + (ms < 10.0 && ms != 0.0 && ms > -10.0) ? 3 : 0, + ms, (unit ? " ms" : "")); + + Assert(nchars_written < bufsize && + "CDBEXPLAIN: size of char buffer is smaller than the required number of chars"); +} /* cdbexplain_formatSeconds */ + +/* + * cdbexplain_formatExtraText + * Format extra message text into the EXPLAIN output buffer. + */ +static void +cdbexplain_formatExtraText(StringInfo str, + int indent, + int segindex, + const char *notes, + int notelen) +{ + const char *cp = notes; + const char *ep = notes + notelen; + + /* Could be more than one line... 
*/ + while (cp < ep) + { + const char *nlp = memchr(cp, '\n', ep - cp); + const char *dp = nlp ? nlp : ep; + + /* Strip trailing whitespace. */ + while (cp < dp && + isspace(dp[-1])) + dp--; + + /* Add to output buffer. */ + if (cp < dp) + { + appendStringInfoSpaces(str, indent * 2); + if (segindex >= 0) + { + appendStringInfo(str, "(seg%d) ", segindex); + if (segindex < 10) + appendStringInfoChar(str, ' '); + if (segindex < 100) + appendStringInfoChar(str, ' '); + } + appendBinaryStringInfo(str, cp, dp - cp); + if (nlp) + appendStringInfoChar(str, '\n'); + } + + if (!nlp) + break; + cp = nlp + 1; + } +} /* cdbexplain_formatExtraText */ + +/* + * nodeSupportWorkfileCaching + * Return true if a given node supports workfile caching. + */ +static bool +nodeSupportWorkfileCaching(PlanState *planstate) +{ + return (IsA(planstate, SortState) || + IsA(planstate, HashJoinState) || + (IsA(planstate, AggState) &&((Agg *) planstate->plan)->aggstrategy == AGG_HASHED) || + IsA(planstate, MaterialState)); +} + +/* + * Indent a text-format line. + * + * We indent by two spaces per indentation level. However, when emitting + * data for a parallel worker there might already be data on the current line + * (cf. ExplainOpenWorker); in that case, don't indent any more. + */ +static void +ExplainIndentText(ExplainState *es) +{ + Assert(es->format == EXPLAIN_FORMAT_TEXT); + if (es->str->len == 0 || es->str->data[es->str->len - 1] == '\n') + appendStringInfoSpaces(es->str, es->indent * 2); +} + + +static void +show_dispatch_info(ExecSlice *slice, ExplainState *es, Plan *plan) +{ + int segments; + + /* + * In non-parallel query, there is no slice information. 
+ */ + if (!slice) + return; + + switch (slice->gangType) + { + case GANGTYPE_UNALLOCATED: + case GANGTYPE_ENTRYDB_READER: + segments = 0; + break; + + case GANGTYPE_PRIMARY_WRITER: + case GANGTYPE_PRIMARY_READER: + case GANGTYPE_SINGLETON_READER: + { + segments = list_length(slice->segments); + break; + } + + default: + segments = 0; /* keep compiler happy */ + Assert(false); + break; + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (segments == 0) + appendStringInfo(es->str, " (slice%d)", slice->sliceIndex); + else if (slice->primaryGang && gp_log_gang >= GPVARS_VERBOSITY_DEBUG) + /* + * In gpdb 5 there was a unique gang_id for each gang, this was + * retired since gpdb 6, so we use the qe identifier from the first + * segment of the gang to identify each gang. + */ + appendStringInfo(es->str, " (slice%d; gang%d; segments: %d)", + slice->sliceIndex, + slice->primaryGang->db_descriptors[0]->identifier, + segments); + else + appendStringInfo(es->str, " (slice%d; segments: %d)", + slice->sliceIndex, segments); + } + else + { + ExplainPropertyInteger("Slice", NULL, slice->sliceIndex, es); + if (slice->primaryGang && gp_log_gang >= GPVARS_VERBOSITY_DEBUG) + ExplainPropertyInteger("Gang", NULL, slice->primaryGang->db_descriptors[0]->identifier, es); + ExplainPropertyInteger("Segments", NULL, segments, es); + ExplainPropertyText("Gang Type", gangTypeToString(slice->gangType), es); + } +} + +/* + * Create a per-plan-node workspace for collecting per-worker data. + * + * Output related to each worker will be temporarily "set aside" into a + * separate buffer, which we'll merge into the main output stream once + * we've processed all data for the plan node. This makes it feasible to + * generate a coherent sub-group of fields for each worker, even though the + * code that produces the fields is in several different places in this file. + * Formatting of such a set-aside field group is managed by + * ExplainOpenSetAsideGroup and ExplainSaveGroup/ExplainRestoreGroup. 
+ */ +static ExplainWorkersState * +ExplainCreateWorkersState(int num_workers) +{ + ExplainWorkersState *wstate; + + wstate = (ExplainWorkersState *) palloc(sizeof(ExplainWorkersState)); + wstate->num_workers = num_workers; + wstate->worker_inited = (bool *) palloc0(num_workers * sizeof(bool)); + wstate->worker_str = (StringInfoData *) + palloc0(num_workers * sizeof(StringInfoData)); + wstate->worker_state_save = (int *) palloc(num_workers * sizeof(int)); + return wstate; +} +/* + * Indent a YAML line. + * + * YAML lines are ordinarily indented by two spaces per indentation level. + * The text emitted for each property begins just prior to the preceding + * line-break, except for the first property in an unlabeled group, for which + * it begins immediately after the "- " that introduces the group. The first + * property of the group appears on the same line as the opening "- ". + */ +static void +ExplainYAMLLineStarting(ExplainState *es) +{ + Assert(es->format == EXPLAIN_FORMAT_YAML); + if (linitial_int(es->grouping_stack) == 0) + { + linitial_int(es->grouping_stack) = 1; + } + else + { + appendStringInfoChar(es->str, '\n'); + appendStringInfoSpaces(es->str, es->indent * 2); + } +} + +static void +ExplainPropertyStringInfo(const char *qlabel, ExplainState *es, const char *fmt,...) +{ + StringInfoData buf; + + initStringInfo(&buf); + + for (;;) + { + va_list args; + int needed; + + /* Try to format the data. */ + va_start(args, fmt); + needed = appendStringInfoVA(&buf, fmt, args); + va_end(args); + + if (needed == 0) + break; + + /* Double the buffer size and try again. */ + enlargeStringInfo(&buf, needed); + } + + ExplainPropertyText(qlabel, buf.data, es); + pfree(buf.data); +} + +/* + * Emit opening or closing XML tag. + * + * "flags" must contain X_OPENING, X_CLOSING, or X_CLOSE_IMMEDIATE. + * Optionally, OR in X_NOWHITESPACE to suppress the whitespace we'd normally + * add. 
+ * + * XML restricts tag names more than our other output formats, eg they can't + * contain white space or slashes. Replace invalid characters with dashes, + * so that for example "I/O Read Time" becomes "I-O-Read-Time". + */ +static void +ExplainXMLTag(const char *tagname, int flags, ExplainState *es) +{ + const char *s; + const char *valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_."; + + if ((flags & X_NOWHITESPACE) == 0) + appendStringInfoSpaces(es->str, 2 * es->indent); + appendStringInfoCharMacro(es->str, '<'); + if ((flags & X_CLOSING) != 0) + appendStringInfoCharMacro(es->str, '/'); + for (s = tagname; *s; s++) + appendStringInfoChar(es->str, strchr(valid, *s) ? *s : '-'); + if ((flags & X_CLOSE_IMMEDIATE) != 0) + appendStringInfoString(es->str, " /"); + appendStringInfoCharMacro(es->str, '>'); + if ((flags & X_NOWHITESPACE) == 0) + appendStringInfoCharMacro(es->str, '\n'); +} +/* + * cdbexplain_showExecStatsEnd + * Called by qDisp process to format the overall statistics for a query + * into the caller's buffer. + * + * 'ctx' is the CdbExplain_ShowStatCtx object which was created by a call to + * cdbexplain_showExecStatsBegin() and contains statistics which have + * been accumulated over a series of calls to cdbexplain_showExecStats(). + * Invalid on return (it is freed). + * + * This doesn't free the CdbExplain_ShowStatCtx object or buffers, because + * they will be free'd shortly by the end of statement anyway. 
+ */ +static void +cdbexplain_showExecStatsEnd(struct PlannedStmt *stmt, + struct CdbExplain_ShowStatCtx *showstatctx, + struct EState *estate, + ExplainState *es) +{ + if (!es->summary) + return; + + gpexplain_formatSlicesOutput(showstatctx, estate, es); + + if (!IsResManagerMemoryPolicyNone()) + { + ExplainOpenGroup("Statement statistics", "Statement statistics", true, es); + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, "Memory used: %ldkB\n", (long) kb(stmt->query_mem)); + else + ExplainPropertyInteger("Memory used", "kB", kb(stmt->query_mem), es); + + if (showstatctx->workmemwanted_max > 0) + { + long mem_wanted; + + mem_wanted = (long) PolicyAutoStatementMemForNoSpill(stmt, + (uint64) showstatctx->workmemwanted_max); + + /* + * Round up to a kilobyte in case we end up requiring less than + * that. + */ + if (mem_wanted <= 1024L) + mem_wanted = 1L; + else + mem_wanted = mem_wanted / 1024L; + + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, "Memory wanted: %ldkB\n", mem_wanted); + else + ExplainPropertyInteger("Memory wanted", "kB", mem_wanted, es); + } + + ExplainCloseGroup("Statement statistics", "Statement statistics", true, es); + } +} /* cdbexplain_showExecStatsEnd */ + +/* + * Given a statistics context search for all the slice statistics + * and format them to the correct layout + */ +static void +gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, + struct EState *estate, + ExplainState *es) +{ + ExecSlice *slice; + int sliceIndex; + int flag; + double total_memory_across_slices = 0; + + char avgbuf[50]; + char maxbuf[50]; + char segbuf[50]; + + if (showstatctx->nslice > 0) + ExplainOpenGroup("Slice statistics", "Slice statistics", false, es); + + for (sliceIndex = 0; sliceIndex < showstatctx->nslice; sliceIndex++) + { + CdbExplain_SliceSummary *ss = &showstatctx->slices[sliceIndex]; + CdbExplain_DispatchSummary *ds = &ss->dispatchSummary; + + flag = es->str->len; + if (es->format == 
EXPLAIN_FORMAT_TEXT) + { + + appendStringInfo(es->str, " (slice%d) ", sliceIndex); + if (sliceIndex < 10) + appendStringInfoChar(es->str, ' '); + + appendStringInfoString(es->str, " "); + } + else + { + ExplainOpenGroup("Slice", NULL, true, es); + ExplainPropertyInteger("Slice", NULL, sliceIndex, es); + } + + /* Worker counts */ + slice = getCurrentSlice(estate, sliceIndex); + if (slice && + list_length(slice->segments) > 0 && + list_length(slice->segments) != ss->dispatchSummary.nOk) + { + int nNotDispatched; + StringInfoData workersInformationText; + + nNotDispatched = list_length(slice->segments) - ds->nResult + ds->nNotDispatched; + + es->str->data[flag] = (ss->dispatchSummary.nError > 0) ? 'X' : '_'; + + initStringInfo(&workersInformationText); + appendStringInfo(&workersInformationText, "Workers:"); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (ds->nError == 1) + { + appendStringInfo(&workersInformationText, + " %d error;", + ds->nError); + } + else if (ds->nError > 1) + { + appendStringInfo(&workersInformationText, + " %d errors;", + ds->nError); + } + } + else + { + ExplainOpenGroup("Workers", "Workers", true, es); + if (ds->nError > 0) + ExplainPropertyInteger("Errors", NULL, ds->nError, es); + } + + if (ds->nCanceled > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(&workersInformationText, + " %d canceled;", + ds->nCanceled); + } + else + { + ExplainPropertyInteger("Canceled", NULL, ds->nCanceled, es); + } + } + + if (nNotDispatched > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(&workersInformationText, + " %d not dispatched;", + nNotDispatched); + } + else + { + ExplainPropertyInteger("Not Dispatched", NULL, nNotDispatched, es); + } + } + + if (ds->nIgnorableError > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(&workersInformationText, + " %d aborted;", + ds->nIgnorableError); + } + else + { + ExplainPropertyInteger("Aborted", NULL, ds->nIgnorableError, es); + } + } + + if 
(ds->nOk > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(&workersInformationText, + " %d ok;", + ds->nOk); + } + else + { + ExplainPropertyInteger("Ok", NULL, ds->nOk, es); + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + workersInformationText.len--; + ExplainPropertyStringInfo("Workers", es, "%s. ", workersInformationText.data); + } + else + { + ExplainCloseGroup("Workers", "Workers", true, es); + } + } + + /* Executor memory high-water mark */ + cdbexplain_formatMemory(maxbuf, sizeof(maxbuf), ss->peakmemused.vmax); + if (ss->peakmemused.vcnt == 1) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + const char *seg = segbuf; + + if (ss->peakmemused.imax >= 0) + { + cdbexplain_formatSeg(segbuf, sizeof(segbuf), ss->peakmemused.imax, 999); + } + else if (slice && list_length(slice->segments) > 0) + { + seg = " (entry db)"; + } + else + { + seg = ""; + } + appendStringInfo(es->str, + "Executor memory: %s%s.", + maxbuf, + seg); + } + else + { + ExplainPropertyInteger("Executor Memory", "kB", ss->peakmemused.vmax, es); + } + } + else if (ss->peakmemused.vcnt > 1) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + cdbexplain_formatMemory(avgbuf, sizeof(avgbuf), cdbexplain_agg_avg(&ss->peakmemused)); + cdbexplain_formatSeg(segbuf, sizeof(segbuf), ss->peakmemused.imax, ss->nworker); + appendStringInfo(es->str, + "Executor memory: %s avg x %d workers, %s max%s.", + avgbuf, + ss->peakmemused.vcnt, + maxbuf, + segbuf); + } + else + { + ExplainOpenGroup("Executor Memory", "Executor Memory", true, es); + ExplainPropertyInteger("Average", "kB", cdbexplain_agg_avg(&ss->peakmemused), es); + ExplainPropertyInteger("Workers", NULL, ss->peakmemused.vcnt, es); + ExplainPropertyInteger("Maximum Memory Used", "kB", ss->peakmemused.vmax, es); + ExplainCloseGroup("Executor Memory", "Executor Memory", true, es); + } + } + + if (EXPLAIN_MEMORY_VERBOSITY_SUPPRESS < explain_memory_verbosity) + { + /* Vmem reserved by QEs */ + cdbexplain_formatMemory(maxbuf, 
sizeof(maxbuf), ss->vmem_reserved.vmax); + if (ss->vmem_reserved.vcnt == 1) + { + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + const char *seg = segbuf; + + if (ss->vmem_reserved.imax >= 0) + { + cdbexplain_formatSeg(segbuf, sizeof(segbuf), ss->vmem_reserved.imax, 999); + } + else if (slice && list_length(slice->segments) > 0) + { + seg = " (entry db)"; + } + else + { + seg = ""; + } + appendStringInfo(es->str, + " Vmem reserved: %s%s.", + maxbuf, + seg); + } + else + { + ExplainPropertyInteger("Virtual Memory", "kB", ss->vmem_reserved.vmax, es); + } + } + else if (ss->vmem_reserved.vcnt > 1) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + cdbexplain_formatMemory(avgbuf, sizeof(avgbuf), cdbexplain_agg_avg(&ss->vmem_reserved)); + cdbexplain_formatSeg(segbuf, sizeof(segbuf), ss->vmem_reserved.imax, ss->nworker); + appendStringInfo(es->str, + " Vmem reserved: %s avg x %d workers, %s max%s.", + avgbuf, + ss->vmem_reserved.vcnt, + maxbuf, + segbuf); + } + else + { + ExplainOpenGroup("Virtual Memory", "Virtual Memory", true, es); + ExplainPropertyInteger("Average", "kB", cdbexplain_agg_avg(&ss->vmem_reserved), es); + ExplainPropertyInteger("Workers", NULL, ss->vmem_reserved.vcnt, es); + ExplainPropertyInteger("Maximum Memory Used", "kB", ss->vmem_reserved.vmax, es); + ExplainCloseGroup("Virtual Memory", "Virtual Memory", true, es); + } + + } + } + + /* Work_mem used/wanted (max over all nodes and workers of slice) */ + if (ss->workmemused_max + ss->workmemwanted_max > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + cdbexplain_formatMemory(maxbuf, sizeof(maxbuf), ss->workmemused_max); + appendStringInfo(es->str, " Work_mem: %s max", maxbuf); + if (ss->workmemwanted_max > 0) + { + es->str->data[flag] = '*'; /* draw attention to this slice */ + cdbexplain_formatMemory(maxbuf, sizeof(maxbuf), ss->workmemwanted_max); + appendStringInfo(es->str, ", %s wanted", maxbuf); + } + appendStringInfoChar(es->str, '.'); + } + else + { + ExplainPropertyInteger("Work Maximum 
Memory", "kB", ss->workmemused_max, es); + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + + ExplainCloseGroup("Slice", NULL, true, es); + } + + if (showstatctx->nslice > 0) + ExplainCloseGroup("Slice statistics", "Slice statistics", false, es); + + if (total_memory_across_slices > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, "Total memory used across slices: %.0fK bytes \n", total_memory_across_slices); + } + else + { + ExplainPropertyInteger("Total memory used across slices", "bytes", total_memory_across_slices, es); + } + } +} + +/* + * Emit a JSON line ending. + * + * JSON requires a comma after each property but the last. To facilitate this, + * in JSON format, the text emitted for each property begins just prior to the + * preceding line-break (and comma, if applicable). + */ +static void +ExplainJSONLineEnding(ExplainState *es) +{ + Assert(es->format == EXPLAIN_FORMAT_JSON); + if (linitial_int(es->grouping_stack) != 0) + appendStringInfoChar(es->str, ','); + else + linitial_int(es->grouping_stack) = 1; + appendStringInfoChar(es->str, '\n'); +} + +/* Compute elapsed time in seconds since given timestamp */ +static double +elapsed_time(instr_time *starttime) +{ + instr_time endtime; + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SUBTRACT(endtime, *starttime); + return INSTR_TIME_GET_DOUBLE(endtime); +} + +/* + * Explain a list of children of a CustomScan. + */ +static void +ExplainCustomChildren(CustomScanState *css, List *ancestors, ExplainState *es) +{ + ListCell *cell; + const char *label = + (list_length(css->custom_ps) != 1 ? 
"children" : "child"); + + foreach(cell, css->custom_ps) + VecExplainNode((PlanState *) lfirst(cell), ancestors, label, NULL, es); +} + + +/* + * Show the targetlist of a plan node + */ +static void +show_plan_tlist(PlanState *planstate, List *ancestors, ExplainState *es) +{ + Plan *plan = planstate->plan; + List *context; + List *result = NIL; + bool useprefix; + ListCell *lc; + + /* No work if empty tlist (this occurs eg in bitmap indexscans) */ + if (plan->targetlist == NIL) + return; + /* The tlist of an Append isn't real helpful, so suppress it */ + if (IsA(plan, Append)) + return; + /* Likewise for MergeAppend and RecursiveUnion */ + if (IsA(plan, MergeAppend)) + return; + if (IsA(plan, RecursiveUnion)) + return; + + /* + * Likewise for ForeignScan that executes a direct INSERT/UPDATE/DELETE + * + * Note: the tlist for a ForeignScan that executes a direct INSERT/UPDATE + * might contain subplan output expressions that are confusing in this + * context. The tlist for a ForeignScan that executes a direct UPDATE/ + * DELETE always contains "junk" target columns to identify the exact row + * to update or delete, which would be confusing in this context. So, we + * suppress it in all the cases. 
+ */ + if (IsA(plan, ForeignScan) && + ((ForeignScan *) plan)->operation != CMD_SELECT) + return; + + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + plan, + ancestors); + useprefix = list_length(es->rtable) > 1; + + /* Deparse each result column (we now include resjunk ones) */ + foreach(lc, plan->targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + result = lappend(result, + deparse_expression((Node *) tle->expr, context, + useprefix, false)); + } + + /* Print results */ + ExplainPropertyList("Output", result, es); +} + +/* + * Show a generic expression + */ +static void +show_expression(Node *node, const char *qlabel, + PlanState *planstate, List *ancestors, + bool useprefix, ExplainState *es) +{ + List *context; + char *exprstr; + + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + planstate->plan, + ancestors); + + /* Deparse the expression */ + exprstr = deparse_expression(node, context, useprefix, false); + + /* And add to es->str */ + ExplainPropertyText(qlabel, exprstr, es); +} + +/* + * Show a qualifier expression (which is a List with implicit AND semantics) + */ +static void +show_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + bool useprefix, ExplainState *es) +{ + Node *node; + + /* No work if empty qual */ + if (qual == NIL) + return; + + /* Convert AND list to explicit AND */ + node = (Node *) make_ands_explicit(qual); + + /* And show it */ + show_expression(node, qlabel, planstate, ancestors, useprefix, es); +} + +/* + * Show a qualifier expression for a scan plan node + */ +static void +show_scan_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es) +{ + bool useprefix; + + useprefix = (IsA(planstate->plan, SubqueryScan) || es->verbose); + show_qual(qual, qlabel, planstate, ancestors, useprefix, es); +} + +/* + * Show a qualifier expression for an upper-level plan node + */ +static 
void +show_upper_qual(List *qual, const char *qlabel, + PlanState *planstate, List *ancestors, + ExplainState *es) +{ + bool useprefix; + + useprefix = (list_length(es->rtable) > 1 || es->verbose); + show_qual(qual, qlabel, planstate, ancestors, useprefix, es); +} + +/* + * Show the sort keys for a Sort node. + */ +static void +show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es) +{ + Sort *plan = (Sort *) sortstate->ss.ps.plan; + const char *SortKeystr; + + if (sortstate->noduplicates) + SortKeystr = "Sort Key (Distinct)"; + else + SortKeystr = "Sort Key"; + + show_sort_group_keys((PlanState *) sortstate, SortKeystr, + plan->numCols, 0, plan->sortColIdx, + plan->sortOperators, plan->collations, + plan->nullsFirst, + ancestors, es); +} + +static void +show_windowagg_keys(WindowAggState *waggstate, List *ancestors, ExplainState *es) +{ + WindowAgg *window = (WindowAgg *) waggstate->ss.ps.plan; + + /* The key columns refer to the tlist of the child plan */ + ancestors = lcons(window, ancestors); + if ( window->partNumCols > 0 ) + { + show_sort_group_keys((PlanState *) outerPlanState(waggstate), "Partition By", + window->partNumCols, 0, window->partColIdx, + NULL, NULL, NULL, + ancestors, es); + } + + show_sort_group_keys((PlanState *) outerPlanState(waggstate), "Order By", + window->ordNumCols, 0, window->ordColIdx, + NULL, NULL, NULL, + ancestors, es); + ancestors = list_delete_first(ancestors); + + /* XXX don't show framing for now */ +} + + + +/* + * Show the sort keys for a IncrementalSort node. 
+ */ +static void +show_incremental_sort_keys(IncrementalSortState *incrsortstate, + List *ancestors, ExplainState *es) +{ + IncrementalSort *plan = (IncrementalSort *) incrsortstate->ss.ps.plan; + + show_sort_group_keys((PlanState *) incrsortstate, "Sort Key", + plan->sort.numCols, plan->nPresortedCols, + plan->sort.sortColIdx, + plan->sort.sortOperators, plan->sort.collations, + plan->sort.nullsFirst, + ancestors, es); +} + +/* + * Likewise, for a MergeAppend node. + */ +static void +show_merge_append_keys(MergeAppendState *mstate, List *ancestors, + ExplainState *es) +{ + MergeAppend *plan = (MergeAppend *) mstate->ps.plan; + + show_sort_group_keys((PlanState *) mstate, "Sort Key", + plan->numCols, 0, plan->sortColIdx, + plan->sortOperators, plan->collations, + plan->nullsFirst, + ancestors, es); +} + +/* + * Show the Split key for an SplitTuple + */ +static void +show_tuple_split_keys(TupleSplitState *tstate, List *ancestors, + ExplainState *es) +{ + TupleSplit *plan = (TupleSplit *)tstate->ss.ps.plan; + + ancestors = lcons(tstate, ancestors); + + List *context; + bool useprefix; + List *result = NIL; + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + (Plan *) plan, + ancestors); + useprefix = (list_length(es->rtable) > 1 || es->verbose); + + StringInfoData buf; + initStringInfo(&buf); + + ListCell *lc; + foreach(lc, plan->dqa_expr_lst) + { + DQAExpr *dqa_expr = (DQAExpr *)lfirst(lc); + result = lappend(result, + deparse_expression((Node *) dqa_expr, context, + useprefix, true)); + } + ExplainPropertyList("Split by Col", result, es); + + if (plan->numCols > 0) + show_sort_group_keys(outerPlanState(tstate), "Group Key", + plan->numCols, 0, plan->grpColIdx, + NULL, NULL, NULL, + ancestors, es); + + ancestors = list_delete_first(ancestors); +} + +/* + * Show the grouping keys for an Agg node. 
+ */ +static void +show_agg_keys(AggState *astate, List *ancestors, + ExplainState *es) +{ + Agg *plan = (Agg *) astate->ss.ps.plan; + + if (plan->numCols > 0 || plan->groupingSets) + { + /* The key columns refer to the tlist of the child plan */ + ancestors = lcons(plan, ancestors); + + if (plan->groupingSets) + show_grouping_sets(outerPlanState(astate), plan, ancestors, es); + else + show_sort_group_keys(outerPlanState(astate), "Group Key", + plan->numCols, 0, plan->grpColIdx, + NULL, NULL, NULL, + ancestors, es); + + ancestors = list_delete_first(ancestors); + } +} + +static void +show_grouping_sets(PlanState *planstate, Agg *agg, + List *ancestors, ExplainState *es) +{ + List *context; + bool useprefix; + ListCell *lc; + + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + planstate->plan, + ancestors); + useprefix = (list_length(es->rtable) > 1 || es->verbose); + + ExplainOpenGroup("Grouping Sets", "Grouping Sets", false, es); + + show_grouping_set_keys(planstate, agg, NULL, + context, useprefix, ancestors, es); + + foreach(lc, agg->chain) + { + Agg *aggnode = lfirst(lc); + Sort *sortnode = (Sort *) aggnode->plan.lefttree; + + show_grouping_set_keys(planstate, aggnode, sortnode, + context, useprefix, ancestors, es); + } + + ExplainCloseGroup("Grouping Sets", "Grouping Sets", false, es); +} + +static void +show_grouping_set_keys(PlanState *planstate, + Agg *aggnode, Sort *sortnode, + List *context, bool useprefix, + List *ancestors, ExplainState *es) +{ + Plan *plan = planstate->plan; + char *exprstr; + ListCell *lc; + List *gsets = aggnode->groupingSets; + AttrNumber *keycols = aggnode->grpColIdx; + const char *keyname; + const char *keysetname; + + if (aggnode->aggstrategy == AGG_HASHED || aggnode->aggstrategy == AGG_MIXED) + { + keyname = "Hash Key"; + keysetname = "Hash Keys"; + } + else + { + keyname = "Group Key"; + keysetname = "Group Keys"; + } + + ExplainOpenGroup("Grouping Set", NULL, true, es); + + if (sortnode) + 
{ + show_sort_group_keys(planstate, "Sort Key", + sortnode->numCols, 0, sortnode->sortColIdx, + sortnode->sortOperators, sortnode->collations, + sortnode->nullsFirst, + ancestors, es); + if (es->format == EXPLAIN_FORMAT_TEXT) + es->indent++; + } + + ExplainOpenGroup(keysetname, keysetname, false, es); + + foreach(lc, gsets) + { + List *result = NIL; + ListCell *lc2; + + foreach(lc2, (List *) lfirst(lc)) + { + Index i = lfirst_int(lc2); + AttrNumber keyresno = keycols[i]; + TargetEntry *target = get_tle_by_resno(plan->targetlist, + keyresno); + + if (!target) + elog(ERROR, "no tlist entry for key %d", keyresno); + /* Deparse the expression, showing any top-level cast */ + exprstr = deparse_expression((Node *) target->expr, context, + useprefix, true); + + result = lappend(result, exprstr); + } + + if (!result && es->format == EXPLAIN_FORMAT_TEXT) + ExplainPropertyText(keyname, "()", es); + else + ExplainPropertyListNested(keyname, result, es); + } + + ExplainCloseGroup(keysetname, keysetname, false, es); + + if (sortnode && es->format == EXPLAIN_FORMAT_TEXT) + es->indent--; + + ExplainCloseGroup("Grouping Set", NULL, true, es); +} + +/* + * Show the grouping keys for a Group node. + */ +#if 0 +static void +show_group_keys(GroupState *gstate, List *ancestors, + ExplainState *es) +{ + Group *plan = (Group *) gstate->ss.ps.plan; + + /* The key columns refer to the tlist of the child plan */ + ancestors = lcons(plan, ancestors); + show_sort_group_keys(outerPlanState(gstate), "Group Key", + plan->numCols, 0, plan->grpColIdx, + NULL, NULL, NULL, + ancestors, es); + ancestors = list_delete_first(ancestors); +} +#endif + +/* + * Common code to show sort/group keys, which are represented in plan nodes + * as arrays of targetlist indexes. If it's a sort key rather than a group + * key, also pass sort operators/collations/nullsFirst arrays. 
+ */ +static void +show_sort_group_keys(PlanState *planstate, const char *qlabel, + int nkeys, int nPresortedKeys, AttrNumber *keycols, + Oid *sortOperators, Oid *collations, bool *nullsFirst, + List *ancestors, ExplainState *es) +{ + Plan *plan = planstate->plan; + List *context; + List *result = NIL; + List *resultPresorted = NIL; + StringInfoData sortkeybuf; + bool useprefix; + int keyno; + + if (nkeys <= 0) + return; + + initStringInfo(&sortkeybuf); + + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + plan, + ancestors); + useprefix = (list_length(es->rtable) > 1 || es->verbose); + + for (keyno = 0; keyno < nkeys; keyno++) + { + /* find key expression in tlist */ + AttrNumber keyresno = keycols[keyno]; + TargetEntry *target = get_tle_by_resno(plan->targetlist, + keyresno); + char *exprstr; + + if (!target) + elog(ERROR, "no tlist entry for key %d", keyresno); + /* Deparse the expression, showing any top-level cast */ + exprstr = deparse_expression((Node *) target->expr, context, + useprefix, true); + resetStringInfo(&sortkeybuf); + appendStringInfoString(&sortkeybuf, exprstr); + /* Append sort order information, if relevant */ + if (sortOperators != NULL) + show_sortorder_options(&sortkeybuf, + (Node *) target->expr, + sortOperators[keyno], + collations[keyno], + nullsFirst[keyno]); + /* Emit one property-list item per sort key */ + result = lappend(result, pstrdup(sortkeybuf.data)); + if (keyno < nPresortedKeys) + resultPresorted = lappend(resultPresorted, exprstr); + } + + ExplainPropertyList(qlabel, result, es); + + /* + * GPDB_90_MERGE_FIXME: handle rollup times printing + * if (rollup_gs_times > 1) + * appendStringInfo(es->str, " (%d times)", rollup_gs_times); + */ + if (nPresortedKeys > 0) + ExplainPropertyList("Presorted Key", resultPresorted, es); +} + +/* + * Append nondefault characteristics of the sort ordering of a column to buf + * (collation, direction, NULLS FIRST/LAST) + */ +static void 
+show_sortorder_options(StringInfo buf, Node *sortexpr, + Oid sortOperator, Oid collation, bool nullsFirst) +{ + Oid sortcoltype = exprType(sortexpr); + bool reverse = false; + TypeCacheEntry *typentry; + + typentry = lookup_type_cache(sortcoltype, + TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); + + /* + * Print COLLATE if it's not default for the column's type. There are + * some cases where this is redundant, eg if expression is a column whose + * declared collation is that collation, but it's hard to distinguish that + * here (and arguably, printing COLLATE explicitly is a good idea anyway + * in such cases). + */ + if (OidIsValid(collation) && collation != get_typcollation(sortcoltype)) + { + char *collname = get_collation_name(collation); + + if (collname == NULL) + elog(ERROR, "cache lookup failed for collation %u", collation); + appendStringInfo(buf, " COLLATE %s", quote_identifier(collname)); + } + + /* Print direction if not ASC, or USING if non-default sort operator */ + if (sortOperator == typentry->gt_opr) + { + appendStringInfoString(buf, " DESC"); + reverse = true; + } + else if (sortOperator != typentry->lt_opr) + { + char *opname = get_opname(sortOperator); + + if (opname == NULL) + elog(ERROR, "cache lookup failed for operator %u", sortOperator); + appendStringInfo(buf, " USING %s", opname); + /* Determine whether operator would be considered ASC or DESC */ + (void) get_equality_op_for_ordering_op(sortOperator, &reverse); + } + + /* Add NULLS FIRST/LAST only if it wouldn't be default */ + if (nullsFirst && !reverse) + { + appendStringInfoString(buf, " NULLS FIRST"); + } + else if (!nullsFirst && reverse) + { + appendStringInfoString(buf, " NULLS LAST"); + } +} + +/* + * Show TABLESAMPLE properties + */ +static void +show_tablesample(TableSampleClause *tsc, PlanState *planstate, + List *ancestors, ExplainState *es) +{ + List *context; + bool useprefix; + char *method_name; + List *params = NIL; + char *repeatable; + ListCell *lc; + + /* Set up deparsing 
context */ + context = set_deparse_context_plan(es->deparse_cxt, + planstate->plan, + ancestors); + useprefix = list_length(es->rtable) > 1; + + /* Get the tablesample method name */ + method_name = get_func_name(tsc->tsmhandler); + + /* Deparse parameter expressions */ + foreach(lc, tsc->args) + { + Node *arg = (Node *) lfirst(lc); + + params = lappend(params, + deparse_expression(arg, context, + useprefix, false)); + } + if (tsc->repeatable) + repeatable = deparse_expression((Node *) tsc->repeatable, context, + useprefix, false); + else + repeatable = NULL; + + /* Print results */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + bool first = true; + + ExplainIndentText(es); + appendStringInfo(es->str, "Sampling: %s (", method_name); + foreach(lc, params) + { + if (!first) + appendStringInfoString(es->str, ", "); + appendStringInfoString(es->str, (const char *) lfirst(lc)); + first = false; + } + appendStringInfoChar(es->str, ')'); + if (repeatable) + appendStringInfo(es->str, " REPEATABLE (%s)", repeatable); + appendStringInfoChar(es->str, '\n'); + } + else + { + ExplainPropertyText("Sampling Method", method_name, es); + ExplainPropertyList("Sampling Parameters", params, es); + if (repeatable) + ExplainPropertyText("Repeatable Seed", repeatable, es); + } +} + +/* + * If it's EXPLAIN ANALYZE, show tuplesort stats for a sort node + * + * GPDB_90_MERGE_FIXME: The sort statistics are stored quite differently from + * upstream, it would be nice to rewrite this to avoid looping over all the + * sort methods and instead have a _get_stats() function as in upstream. + */ +static void +show_sort_info(SortState *sortstate, ExplainState *es) +{ + CdbExplain_NodeSummary *ns; + int i; + + if (!es->analyze) + return; + + ns = es->runtime ? 
((PlanState *)sortstate)->instrument->rt_cdbNodeSummary : ((PlanState *)sortstate)->instrument->cdbNodeSummary; + if (!ns) + return; + + for (i = 0; i < NUM_SORT_METHOD; i++) + { + CdbExplain_Agg *agg; + const char *sortMethod; + const char *spaceType; + int j; + + /* + * Memory and disk usage statistics are saved separately in GPDB so + * need to pull out the one in question first + */ + for (j = 0; j < NUM_SORT_SPACE_TYPE; j++) + { + agg = &ns->sortSpaceUsed[j][i]; + + if (agg->vcnt > 0) + break; + } + /* + * If the current sort method in question hasn't been used, skip to + * next one + */ + if (j >= NUM_SORT_SPACE_TYPE) + continue; + + sortMethod = tuplesort_method_name(i); + spaceType = tuplesort_space_type_name(j); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "Sort Method: %s %s: " INT64_FORMAT "kB\n", + sortMethod, spaceType, (long) agg->vsum); + if (es->verbose) + { + appendStringInfo(es->str, " Max Memory: " INT64_FORMAT "kB Avg Memory: " INT64_FORMAT "kb (%d segments)\n", + (long) agg->vmax, + (long) (agg->vsum / agg->vcnt), + agg->vcnt); + } + } + else + { + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyInteger("Sort Space Used", "kB", agg->vsum, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + if (es->verbose) + { + ExplainPropertyInteger("Sort Max Segment Memory", "kB", agg->vmax, es); + ExplainPropertyInteger("Sort Avg Segment Memory", "kB", (agg->vsum / agg->vcnt), es); + ExplainPropertyInteger("Sort Segments", NULL, agg->vcnt, es); + } + } + } + + /* + * You might think we should just skip this stanza entirely when + * es->hide_workers is true, but then we'd get no sort-method output at + * all. We have to make it look like worker 0's data is top-level data. + * This is easily done by just skipping the OpenWorker/CloseWorker calls. 
+ * Currently, we don't worry about the possibility that there are multiple + * workers in such a case; if there are, duplicate output fields will be + * emitted. + */ + if (sortstate->shared_info != NULL) + { + int n; + + for (n = 0; n < sortstate->shared_info->num_workers; n++) + { + TuplesortInstrumentation *sinstrument; + const char *sortMethod; + const char *spaceType; + int64 spaceUsed; + + sinstrument = &sortstate->shared_info->sinstrument[n]; + if (sinstrument->sortMethod == SORT_TYPE_STILL_IN_PROGRESS) + continue; /* ignore any unfilled slots */ + sortMethod = tuplesort_method_name(sinstrument->sortMethod); + spaceType = tuplesort_space_type_name(sinstrument->spaceType); + spaceUsed = sinstrument->spaceUsed; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Sort Method: %s %s: " INT64_FORMAT "kB\n", + sortMethod, spaceType, spaceUsed); + } + else + { + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyInteger("Sort Space Used", "kB", spaceUsed, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + } + + if (es->workers_state) + ExplainCloseWorker(n, es); + } + } +} + +/* + * Incremental sort nodes sort in (a potentially very large number of) batches, + * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into + * an intelligible summary. + * + * This function is used for both a non-parallel node and each worker in a + * parallel incremental sort node. + */ +static void +show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo, + const char *groupLabel, bool indent, ExplainState *es) +{ + ListCell *methodCell; + List *methodNames = NIL; + + /* Generate a list of sort methods used across all groups. 
*/ + for (int bit = 0; bit < NUM_TUPLESORTMETHODS; bit++) + { + TuplesortMethod sortMethod = (1 << bit); + + if (groupInfo->sortMethods & sortMethod) + { + const char *methodName = tuplesort_method_name(sortMethod); + + methodNames = lappend(methodNames, unconstify(char *, methodName)); + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (indent) + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "%s Groups: " INT64_FORMAT " Sort Method", groupLabel, + groupInfo->groupCount); + /* plural/singular based on methodNames size */ + if (list_length(methodNames) > 1) + appendStringInfoString(es->str, "s: "); + else + appendStringInfoString(es->str, ": "); + foreach(methodCell, methodNames) + { + appendStringInfoString(es->str, (char *) methodCell->ptr_value); + if (foreach_current_index(methodCell) < list_length(methodNames) - 1) + appendStringInfoString(es->str, ", "); + } + + if (groupInfo->maxMemorySpaceUsed > 0) + { + int64 avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + appendStringInfo(es->str, " Average %s: " INT64_FORMAT "kB Peak %s: " INT64_FORMAT "kB", + spaceTypeName, avgSpace, + spaceTypeName, groupInfo->maxMemorySpaceUsed); + } + + if (groupInfo->maxDiskSpaceUsed > 0) + { + int64 avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + + const char *spaceTypeName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + appendStringInfo(es->str, " Average %s: " INT64_FORMAT "kB Peak %s: " INT64_FORMAT "kB", + spaceTypeName, avgSpace, + spaceTypeName, groupInfo->maxDiskSpaceUsed); + } + } + else + { + StringInfoData groupName; + + initStringInfo(&groupName); + appendStringInfo(&groupName, "%s Groups", groupLabel); + ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es); + ExplainPropertyInteger("Group Count", NULL, groupInfo->groupCount, es); + + 
ExplainPropertyList("Sort Methods Used", methodNames, es); + + if (groupInfo->maxMemorySpaceUsed > 0) + { + int64 avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName; + StringInfoData memoryName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + initStringInfo(&memoryName); + appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName); + ExplainOpenGroup("Sort Space", memoryName.data, true, es); + + ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es); + ExplainPropertyInteger("Peak Sort Space Used", "kB", + groupInfo->maxMemorySpaceUsed, es); + + ExplainCloseGroup("Sort Space", memoryName.data, true, es); + } + if (groupInfo->maxDiskSpaceUsed > 0) + { + int64 avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + const char *spaceTypeName; + StringInfoData diskName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + initStringInfo(&diskName); + appendStringInfo(&diskName, "Sort Space %s", spaceTypeName); + ExplainOpenGroup("Sort Space", diskName.data, true, es); + + ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es); + ExplainPropertyInteger("Peak Sort Space Used", "kB", + groupInfo->maxDiskSpaceUsed, es); + + ExplainCloseGroup("Sort Space", diskName.data, true, es); + } + + ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es); + } +} + +/* + * If it's EXPLAIN ANALYZE, show tuplesort stats for an incremental sort node + */ +static void +show_incremental_sort_info(IncrementalSortState *incrsortstate, + ExplainState *es) +{ + IncrementalSortGroupInfo *fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo; + + fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo; + + if (!es->analyze) + return; + + /* + * Since we never have any prefix groups unless we've first sorted a full + * groups and transitioned modes (copying the tuples into a prefix group), + * we don't need to do anything if 
there were 0 full groups. + * + * We still have to continue after this block if there are no full groups, + * though, since it's possible that we have workers that did real work + * even if the leader didn't participate. + */ + if (fullsortGroupInfo->groupCount > 0) + { + show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", true, es); + prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo; + if (prefixsortGroupInfo->groupCount > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + show_incremental_sort_group_info(prefixsortGroupInfo, "Pre-sorted", true, es); + } + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + } + + if (incrsortstate->shared_info != NULL) + { + int n; + bool indent_first_line; + + for (n = 0; n < incrsortstate->shared_info->num_workers; n++) + { + IncrementalSortInfo *incsort_info = + &incrsortstate->shared_info->sinfo[n]; + + /* + * If a worker hasn't processed any sort groups at all, then + * exclude it from output since it either didn't launch or didn't + * contribute anything meaningful. + */ + fullsortGroupInfo = &incsort_info->fullsortGroupInfo; + + /* + * Since we never have any prefix groups unless we've first sorted + * a full groups and transitioned modes (copying the tuples into a + * prefix group), we don't need to do anything if there were 0 + * full groups. 
+ */ + if (fullsortGroupInfo->groupCount == 0) + continue; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + indent_first_line = es->workers_state == NULL || es->verbose; + show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", + indent_first_line, es); + prefixsortGroupInfo = &incsort_info->prefixsortGroupInfo; + if (prefixsortGroupInfo->groupCount > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + show_incremental_sort_group_info(prefixsortGroupInfo, "Pre-sorted", true, es); + } + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + + if (es->workers_state) + ExplainCloseWorker(n, es); + } + } +} + +/* + * Show information on hash buckets/batches. + */ +static void +show_hash_info(HashState *hashstate, ExplainState *es) +{ + HashInstrumentation hinstrument = {0}; + + /* + * Collect stats from the local process, even when it's a parallel query. + * In a parallel query, the leader process may or may not have run the + * hash join, and even if it did it may not have built a hash table due to + * timing (if it started late it might have seen no tuples in the outer + * relation and skipped building the hash table). Therefore we have to be + * prepared to get instrumentation data from all participants. + */ + if (hashstate->hinstrument) + memcpy(&hinstrument, hashstate->hinstrument, + sizeof(HashInstrumentation)); + /* + * Merge results from workers. In the parallel-oblivious case, the + * results from all participants should be identical, except where + * participants didn't run the join at all so have no data. In the + * parallel-aware case, we need to consider all the results. Each worker + * may have seen a different subset of batches and we want to report the + * highest memory usage across all batches. We take the maxima of other + * values too, for the same reasons as in ExecHashAccumInstrumentation. 
+ */ + if (hashstate->shared_info) + { + SharedHashInfo *shared_info = hashstate->shared_info; + int i; + + for (i = 0; i < shared_info->num_workers; ++i) + { + HashInstrumentation *worker_hi = &shared_info->hinstrument[i]; + + hinstrument.nbuckets = Max(hinstrument.nbuckets, + worker_hi->nbuckets); + hinstrument.nbuckets_original = Max(hinstrument.nbuckets_original, + worker_hi->nbuckets_original); + hinstrument.nbatch = Max(hinstrument.nbatch, + worker_hi->nbatch); + hinstrument.nbatch_original = Max(hinstrument.nbatch_original, + worker_hi->nbatch_original); + hinstrument.space_peak = Max(hinstrument.space_peak, + worker_hi->space_peak); + } + } + + if (hinstrument.nbatch > 0) + { + long spacePeakKb = (hinstrument.space_peak + 1023) / 1024; + + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyInteger("Hash Buckets", NULL, + hinstrument.nbuckets, es); + ExplainPropertyInteger("Original Hash Buckets", NULL, + hinstrument.nbuckets_original, es); + ExplainPropertyInteger("Hash Batches", NULL, + hinstrument.nbatch, es); + ExplainPropertyInteger("Original Hash Batches", NULL, + hinstrument.nbatch_original, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", + spacePeakKb, es); + } + else if (hinstrument.nbatch_original != hinstrument.nbatch || + hinstrument.nbuckets_original != hinstrument.nbuckets) + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n", + hinstrument.nbuckets, + hinstrument.nbuckets_original, + hinstrument.nbatch, + hinstrument.nbatch_original, + spacePeakKb); + } + else + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Buckets: %d Batches: %d Memory Usage: %ldkB\n", + hinstrument.nbuckets, hinstrument.nbatch, + spacePeakKb); + } + } +} + +static void +show_runtime_filter_info(RuntimeFilterState *rfstate, ExplainState *es) +{ + if (es->analyze) + { + if (rfstate->bf != NULL) + ExplainPropertyUInteger("Bloom Bits", NULL, + 
bloom_total_bits(rfstate->bf), es); + } +} + +/* + * Show information on memoize hits/misses/evictions and memory usage. + */ +static void +show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) +{ + Plan *plan = ((PlanState *) mstate)->plan; + ListCell *lc; + List *context; + StringInfoData keystr; + char *seperator = ""; + bool useprefix; + int64 memPeakKb; + + initStringInfo(&keystr); + + /* + * It's hard to imagine having a memoize node with fewer than 2 RTEs, but + * let's just keep the same useprefix logic as elsewhere in this file. + */ + useprefix = list_length(es->rtable) > 1 || es->verbose; + + /* Set up deparsing context */ + context = set_deparse_context_plan(es->deparse_cxt, + plan, + ancestors); + + foreach(lc, ((Memoize *) plan)->param_exprs) + { + Node *expr = (Node *) lfirst(lc); + + appendStringInfoString(&keystr, seperator); + + appendStringInfoString(&keystr, deparse_expression(expr, context, + useprefix, false)); + seperator = ", "; + } + + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyText("Cache Key", keystr.data, es); + ExplainPropertyText("Cache Mode", mstate->binary_mode ? "binary" : "logical", es); + } + else + { + ExplainIndentText(es); + appendStringInfo(es->str, "Cache Key: %s\n", keystr.data); + ExplainIndentText(es); + appendStringInfo(es->str, "Cache Mode: %s\n", mstate->binary_mode ? "binary" : "logical"); + } + + pfree(keystr.data); + + if (!es->analyze) + return; + + if (mstate->stats.cache_misses > 0) + { + /* + * mem_peak is only set when we freed memory, so we must use mem_used + * when mem_peak is 0. 
+ */ + if (mstate->stats.mem_peak > 0) + memPeakKb = (mstate->stats.mem_peak + 1023) / 1024; + else + memPeakKb = (mstate->mem_used + 1023) / 1024; + + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyInteger("Cache Hits", NULL, mstate->stats.cache_hits, es); + ExplainPropertyInteger("Cache Misses", NULL, mstate->stats.cache_misses, es); + ExplainPropertyInteger("Cache Evictions", NULL, mstate->stats.cache_evictions, es); + ExplainPropertyInteger("Cache Overflows", NULL, mstate->stats.cache_overflows, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + } + else + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Hits: " UINT64_FORMAT " Misses: " UINT64_FORMAT " Evictions: " UINT64_FORMAT " Overflows: " UINT64_FORMAT " Memory Usage: " INT64_FORMAT "kB\n", + mstate->stats.cache_hits, + mstate->stats.cache_misses, + mstate->stats.cache_evictions, + mstate->stats.cache_overflows, + memPeakKb); + } + } + + if (mstate->shared_info == NULL) + return; + + /* Show details from parallel workers */ + for (int n = 0; n < mstate->shared_info->num_workers; n++) + { + MemoizeInstrumentation *si; + + si = &mstate->shared_info->sinstrument[n]; + + /* + * Skip workers that didn't do any work. We needn't bother checking + * for cache hits as a miss will always occur before a cache hit. + */ + if (si->cache_misses == 0) + continue; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + /* + * Since the worker's MemoizeState.mem_used field is unavailable to + * us, ExecEndMemoize will have set the + * MemoizeInstrumentation.mem_peak field for us. No need to do the + * zero checks like we did for the serial case above. 
+ */ + memPeakKb = (si->mem_peak + 1023) / 1024; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, + "Hits: " UINT64_FORMAT " Misses: " UINT64_FORMAT " Evictions: " UINT64_FORMAT " Overflows: " UINT64_FORMAT " Memory Usage: " INT64_FORMAT "kB\n", + si->cache_hits, si->cache_misses, + si->cache_evictions, si->cache_overflows, + memPeakKb); + } + else + { + ExplainPropertyInteger("Cache Hits", NULL, + si->cache_hits, es); + ExplainPropertyInteger("Cache Misses", NULL, + si->cache_misses, es); + ExplainPropertyInteger("Cache Evictions", NULL, + si->cache_evictions, es); + ExplainPropertyInteger("Cache Overflows", NULL, + si->cache_overflows, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, + es); + } + + if (es->workers_state) + ExplainCloseWorker(n, es); + } +} + +/* + * Show information on hash aggregate memory usage and batches. + */ +static void +show_hashagg_info(AggState *aggstate, ExplainState *es) +{ + Agg *agg = (Agg *) aggstate->ss.ps.plan; + int64 memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024; + + if (agg->aggstrategy != AGG_HASHED && + agg->aggstrategy != AGG_MIXED) + return; + + if (es->format != EXPLAIN_FORMAT_TEXT) + { + + if (es->costs) + ExplainPropertyInteger("Planned Partitions", NULL, + aggstate->hash_planned_partitions, es); + + /* + * During parallel query the leader may have not helped out. We + * detect this by checking how much memory it used. If we find it + * didn't do any work then we don't show its properties. 
+ */ + if (es->analyze && aggstate->hash_mem_peak > 0) + { + ExplainPropertyInteger("HashAgg Batches", NULL, + aggstate->hash_batches_used, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + ExplainPropertyInteger("Disk Usage", "kB", + aggstate->hash_disk_used, es); + } + } + else + { + bool gotone = false; + + if (es->costs && aggstate->hash_planned_partitions > 0) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Planned Partitions: %d", + aggstate->hash_planned_partitions); + gotone = true; + } + + /* + * During parallel query the leader may have not helped out. We + * detect this by checking how much memory it used. If we find it + * didn't do any work then we don't show its properties. + */ + if (es->analyze && aggstate->hash_mem_peak > 0) + { + if (!gotone) + ExplainIndentText(es); + else + appendStringInfoString(es->str, " "); + + appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", + aggstate->hash_batches_used, memPeakKb); + gotone = true; + + /* Only display disk usage if we spilled to disk */ + if (aggstate->hash_batches_used > 1) + { + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", + aggstate->hash_disk_used); + } + } + + if (gotone) + appendStringInfoChar(es->str, '\n'); + } + + /* Display stats for each parallel worker */ + if (es->analyze && aggstate->shared_info != NULL) + { + for (int n = 0; n < aggstate->shared_info->num_workers; n++) + { + AggregateInstrumentation *sinstrument; + uint64 hash_disk_used; + int hash_batches_used; + + sinstrument = &aggstate->shared_info->sinstrument[n]; + /* Skip workers that didn't do anything */ + if (sinstrument->hash_mem_peak == 0) + continue; + hash_disk_used = sinstrument->hash_disk_used; + hash_batches_used = sinstrument->hash_batches_used; + memPeakKb = (sinstrument->hash_mem_peak + 1023) / 1024; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + + 
appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", + hash_batches_used, memPeakKb); + + /* Only display disk usage if we spilled to disk */ + if (hash_batches_used > 1) + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", + hash_disk_used); + appendStringInfoChar(es->str, '\n'); + } + else + { + ExplainPropertyInteger("HashAgg Batches", NULL, + hash_batches_used, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, + es); + ExplainPropertyInteger("Disk Usage", "kB", hash_disk_used, es); + } + + if (es->workers_state) + ExplainCloseWorker(n, es); + } + } +} + +/* + * If it's EXPLAIN ANALYZE, show exact/lossy pages for a BitmapHeapScan node + */ +static void +show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) +{ + if (es->format != EXPLAIN_FORMAT_TEXT) + { + ExplainPropertyInteger("Exact Heap Blocks", NULL, + planstate->exact_pages, es); + ExplainPropertyInteger("Lossy Heap Blocks", NULL, + planstate->lossy_pages, es); + } + else + { + if (planstate->exact_pages > 0 || planstate->lossy_pages > 0) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "Heap Blocks:"); + if (planstate->exact_pages > 0) + appendStringInfo(es->str, " exact=%ld", planstate->exact_pages); + if (planstate->lossy_pages > 0) + appendStringInfo(es->str, " lossy=%ld", planstate->lossy_pages); + appendStringInfoChar(es->str, '\n'); + } + } +} + +/* + * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node + * + * "which" identifies which instrumentation counter to print + */ +static void +show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es) +{ + double nfiltered; + double nloops; + + if (!es->analyze || !planstate->instrument) + return; + if (which == 2) + nfiltered = planstate->instrument->nfiltered2; + else + nfiltered = planstate->instrument->nfiltered1; + nloops = planstate->instrument->nloops; + + /* In text mode, suppress zero counts; they're not 
interesting enough */ + if (nfiltered > 0 || es->format != EXPLAIN_FORMAT_TEXT) + { + if (nloops > 0) + ExplainPropertyFloat(qlabel, NULL, nfiltered / nloops, 0, es); + else + ExplainPropertyFloat(qlabel, NULL, 0.0, 0, es); + } +} + +/* + * Show extra information for a ForeignScan node. + */ +static void +show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es) +{ + FdwRoutine *fdwroutine = fsstate->fdwroutine; + + /* Let the FDW emit whatever fields it wants */ + if (((ForeignScan *) fsstate->ss.ps.plan)->operation != CMD_SELECT) + { + if (fdwroutine->ExplainDirectModify != NULL) + fdwroutine->ExplainDirectModify(fsstate, es); + } + else + { + if (fdwroutine->ExplainForeignScan != NULL) + fdwroutine->ExplainForeignScan(fsstate, es); + } +} + +/* + * Show initplan params evaluated at Gather or Gather Merge node. + */ +static void +show_eval_params(Bitmapset *bms_params, ExplainState *es) +{ + int paramid = -1; + List *params = NIL; + + Assert(bms_params); + + while ((paramid = bms_next_member(bms_params, paramid)) >= 0) + { + char param[32]; + + snprintf(param, sizeof(param), "$%d", paramid); + params = lappend(params, pstrdup(param)); + } + + if (params) + ExplainPropertyList("Params Evaluated", params, es); +} + +static void +show_join_pruning_info(List *join_prune_ids, ExplainState *es) +{ + List *params = NIL; + ListCell *lc; + + if (!join_prune_ids) + return; + + foreach(lc, join_prune_ids) + { + int paramid = lfirst_int(lc); + char param[32]; + + snprintf(param, sizeof(param), "$%d", paramid); + params = lappend(params, pstrdup(param)); + } + + ExplainPropertyList("Partition Selectors", params, es); +} + +/* + * Fetch the name of an index in an EXPLAIN + * + * We allow plugins to get control here so that plans involving hypothetical + * indexes can be explained. + * + * Note: names returned by this function should be "raw"; the caller will + * apply quoting if needed. 
Formerly the convention was to do quoting here, + * but we don't want that in non-text output formats. + */ +static const char * +explain_get_index_name(Oid indexId) +{ + const char *result; + + if (explain_get_index_name_hook) + result = (*explain_get_index_name_hook) (indexId); + else + result = NULL; + if (result == NULL) + { + /* default behavior: look it up in the catalogs */ + result = get_rel_name(indexId); + if (result == NULL) + elog(ERROR, "cache lookup failed for index %u", indexId); + } + return result; +} + +/* + * Show buffer usage details. + */ +static void +show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + bool has_shared = (usage->shared_blks_hit > 0 || + usage->shared_blks_read > 0 || + usage->shared_blks_dirtied > 0 || + usage->shared_blks_written > 0); + bool has_local = (usage->local_blks_hit > 0 || + usage->local_blks_read > 0 || + usage->local_blks_dirtied > 0 || + usage->local_blks_written > 0); + bool has_temp = (usage->temp_blks_read > 0 || + usage->temp_blks_written > 0); + bool has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) || + !INSTR_TIME_IS_ZERO(usage->blk_write_time)); + bool show_planning = (planning && (has_shared || + has_local || has_temp || has_timing)); + + if (show_planning) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "Planning:\n"); + es->indent++; + } + + /* Show only positive counter values. 
*/ + if (has_shared || has_local || has_temp) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "Buffers:"); + + if (has_shared) + { + appendStringInfoString(es->str, " shared"); + if (usage->shared_blks_hit > 0) + appendStringInfo(es->str, " hit=%lld", + (long long) usage->shared_blks_hit); + if (usage->shared_blks_read > 0) + appendStringInfo(es->str, " read=%lld", + (long long) usage->shared_blks_read); + if (usage->shared_blks_dirtied > 0) + appendStringInfo(es->str, " dirtied=%lld", + (long long) usage->shared_blks_dirtied); + if (usage->shared_blks_written > 0) + appendStringInfo(es->str, " written=%lld", + (long long) usage->shared_blks_written); + if (has_local || has_temp) + appendStringInfoChar(es->str, ','); + } + if (has_local) + { + appendStringInfoString(es->str, " local"); + if (usage->local_blks_hit > 0) + appendStringInfo(es->str, " hit=%lld", + (long long) usage->local_blks_hit); + if (usage->local_blks_read > 0) + appendStringInfo(es->str, " read=%lld", + (long long) usage->local_blks_read); + if (usage->local_blks_dirtied > 0) + appendStringInfo(es->str, " dirtied=%lld", + (long long) usage->local_blks_dirtied); + if (usage->local_blks_written > 0) + appendStringInfo(es->str, " written=%lld", + (long long) usage->local_blks_written); + if (has_temp) + appendStringInfoChar(es->str, ','); + } + if (has_temp) + { + appendStringInfoString(es->str, " temp"); + if (usage->temp_blks_read > 0) + appendStringInfo(es->str, " read=%lld", + (long long) usage->temp_blks_read); + if (usage->temp_blks_written > 0) + appendStringInfo(es->str, " written=%lld", + (long long) usage->temp_blks_written); + } + appendStringInfoChar(es->str, '\n'); + } + + /* As above, show only positive counter values. 
*/ + if (has_timing) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "I/O Timings:"); + if (!INSTR_TIME_IS_ZERO(usage->blk_read_time)) + appendStringInfo(es->str, " read=%0.3f", + INSTR_TIME_GET_MILLISEC(usage->blk_read_time)); + if (!INSTR_TIME_IS_ZERO(usage->blk_write_time)) + appendStringInfo(es->str, " write=%0.3f", + INSTR_TIME_GET_MILLISEC(usage->blk_write_time)); + appendStringInfoChar(es->str, '\n'); + } + + if (show_planning) + es->indent--; + } + else + { + ExplainPropertyInteger("Shared Hit Blocks", NULL, + usage->shared_blks_hit, es); + ExplainPropertyInteger("Shared Read Blocks", NULL, + usage->shared_blks_read, es); + ExplainPropertyInteger("Shared Dirtied Blocks", NULL, + usage->shared_blks_dirtied, es); + ExplainPropertyInteger("Shared Written Blocks", NULL, + usage->shared_blks_written, es); + ExplainPropertyInteger("Local Hit Blocks", NULL, + usage->local_blks_hit, es); + ExplainPropertyInteger("Local Read Blocks", NULL, + usage->local_blks_read, es); + ExplainPropertyInteger("Local Dirtied Blocks", NULL, + usage->local_blks_dirtied, es); + ExplainPropertyInteger("Local Written Blocks", NULL, + usage->local_blks_written, es); + ExplainPropertyInteger("Temp Read Blocks", NULL, + usage->temp_blks_read, es); + ExplainPropertyInteger("Temp Written Blocks", NULL, + usage->temp_blks_written, es); + if (track_io_timing) + { + ExplainPropertyFloat("I/O Read Time", "ms", + INSTR_TIME_GET_MILLISEC(usage->blk_read_time), + 3, es); + ExplainPropertyFloat("I/O Write Time", "ms", + INSTR_TIME_GET_MILLISEC(usage->blk_write_time), + 3, es); + } + } +} + +/* + * Show WAL usage details. + */ +static void +show_wal_usage(ExplainState *es, const WalUsage *usage) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* Show only positive counter values. 
*/ + if ((usage->wal_records >= 0) || (usage->wal_fpi >= 0) || + (usage->wal_bytes >= 0)) + { + ExplainIndentText(es); + appendStringInfoString(es->str, "WAL:"); + + if (usage->wal_records > 0) + appendStringInfo(es->str, " records=%lld", + (long long) usage->wal_records); + if (usage->wal_fpi > 0) + appendStringInfo(es->str, " fpi=%lld", + (long long) usage->wal_fpi); + if (usage->wal_bytes > 0) + appendStringInfo(es->str, " bytes=" UINT64_FORMAT, + usage->wal_bytes); + appendStringInfoChar(es->str, '\n'); + } + } + else + { + ExplainPropertyInteger("WAL Records", NULL, + usage->wal_records, es); + ExplainPropertyInteger("WAL FPI", NULL, + usage->wal_fpi, es); + ExplainPropertyUInteger("WAL Bytes", NULL, + usage->wal_bytes, es); + } +} + +/* + * Add some additional details about an IndexScan or IndexOnlyScan + */ +static void +ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, + ExplainState *es) +{ + const char *indexname = explain_get_index_name(indexid); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (ScanDirectionIsBackward(indexorderdir)) + appendStringInfoString(es->str, " Backward"); + appendStringInfo(es->str, " using %s", quote_identifier(indexname)); + } + else + { + const char *scandir; + + switch (indexorderdir) + { + case BackwardScanDirection: + scandir = "Backward"; + break; + case NoMovementScanDirection: + scandir = "NoMovement"; + break; + case ForwardScanDirection: + scandir = "Forward"; + break; + default: + scandir = "???"; + break; + } + ExplainPropertyText("Scan Direction", scandir, es); + ExplainPropertyText("Index Name", indexname, es); + } +} + +/* + * Show the target of a Scan node + */ +static void +ExplainScanTarget(Scan *plan, ExplainState *es) +{ + ExplainTargetRel((Plan *) plan, plan->scanrelid, es); +} + +/* + * Show the target of a ModifyTable node + * + * Here we show the nominal target (ie, the relation that was named in the + * original query). 
If the actual target(s) is/are different, we'll show them + * in show_modifytable_info(). + */ +static void +ExplainModifyTarget(ModifyTable *plan, ExplainState *es) +{ + ExplainTargetRel((Plan *) plan, plan->nominalRelation, es); +} + +/* + * Show the target relation of a scan or modify node + */ +static void +ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) +{ + char *objectname = NULL; + char *namespace = NULL; + const char *objecttag = NULL; + RangeTblEntry *rte; + char *refname; + int dynamicScanId = 0; + + rte = rt_fetch(rti, es->rtable); + refname = (char *) list_nth(es->rtable_names, rti - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + + switch (nodeTag(plan)) + { + case T_SeqScan: + case T_SampleScan: + case T_IndexScan: + case T_IndexOnlyScan: + case T_BitmapHeapScan: + case T_TidScan: + case T_TidRangeScan: + case T_ForeignScan: + case T_CustomScan: + case T_ModifyTable: + /* Assert it's on a real relation */ + Assert(rte->rtekind == RTE_RELATION); + objectname = get_rel_name(rte->relid); + if (es->verbose) + namespace = get_namespace_name(get_rel_namespace(rte->relid)); + objecttag = "Relation Name"; + + break; + case T_FunctionScan: + { + FunctionScan *fscan = (FunctionScan *) plan; + + /* Assert it's on a RangeFunction */ + Assert(rte->rtekind == RTE_FUNCTION); + + /* + * If the expression is still a function call of a single + * function, we can get the real name of the function. + * Otherwise, punt. (Even if it was a single function call + * originally, the optimizer could have simplified it away.) 
+ */ + if (list_length(fscan->functions) == 1) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) linitial(fscan->functions); + + if (IsA(rtfunc->funcexpr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) rtfunc->funcexpr; + Oid funcid = funcexpr->funcid; + + objectname = get_func_name(funcid); + if (es->verbose) + namespace = + get_namespace_name(get_func_namespace(funcid)); + } + } + objecttag = "Function Name"; + } + break; + case T_TableFunctionScan: + { + TableFunctionScan *fscan = (TableFunctionScan *) plan; + + /* Assert it's on a RangeFunction */ + Assert(rte->rtekind == RTE_TABLEFUNCTION); + + /* + * Unlike in a FunctionScan, in a TableFunctionScan the call + * should always be a function call of a single function. + * Get the real name of the function. + */ + { + RangeTblFunction *rtfunc = fscan->function; + + if (IsA(rtfunc->funcexpr, FuncExpr)) + { + FuncExpr *funcexpr = (FuncExpr *) rtfunc->funcexpr; + Oid funcid = funcexpr->funcid; + + objectname = get_func_name(funcid); + if (es->verbose) + namespace = + get_namespace_name(get_func_namespace(funcid)); + } + } + objecttag = "Function Name"; + + /* might be nice to add order by and scatter by info, if it's a TableFunctionScan */ + } + break; + case T_TableFuncScan: + Assert(rte->rtekind == RTE_TABLEFUNC); + objectname = "xmltable"; + objecttag = "Table Function Name"; + break; + case T_ValuesScan: + Assert(rte->rtekind == RTE_VALUES); + break; + case T_CteScan: + /* Assert it's on a non-self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(!rte->self_reference); + objectname = rte->ctename; + objecttag = "CTE Name"; + break; + case T_NamedTuplestoreScan: + Assert(rte->rtekind == RTE_NAMEDTUPLESTORE); + objectname = rte->enrname; + objecttag = "Tuplestore Name"; + break; + case T_WorkTableScan: + /* Assert it's on a self-reference CTE */ + Assert(rte->rtekind == RTE_CTE); + Assert(rte->self_reference); + objectname = rte->ctename; + objecttag = "CTE Name"; + break; + default: + break; + 
} + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoString(es->str, " on"); + if (namespace != NULL) + appendStringInfo(es->str, " %s.%s", quote_identifier(namespace), + quote_identifier(objectname)); + else if (objectname != NULL) + appendStringInfo(es->str, " %s", quote_identifier(objectname)); + if (objectname == NULL || strcmp(refname, objectname) != 0) + appendStringInfo(es->str, " %s", quote_identifier(refname)); + + if (dynamicScanId != 0) + appendStringInfo(es->str, " (dynamic scan id: %d)", + dynamicScanId); + } + else + { + if (objecttag != NULL && objectname != NULL) + ExplainPropertyText(objecttag, objectname, es); + if (namespace != NULL) + ExplainPropertyText("Schema", namespace, es); + ExplainPropertyText("Alias", refname, es); + + if (dynamicScanId != 0) + ExplainPropertyInteger("Dynamic Scan Id", NULL, dynamicScanId, es); + } +} + +/* + * Show extra information for a ModifyTable node + * + * We have three objectives here. First, if there's more than one target + * table or it's different from the nominal target, identify the actual + * target(s). Second, give FDWs a chance to display extra info about foreign + * targets. Third, show information about ON CONFLICT. + */ +static void +show_modifytable_info(ModifyTableState *mtstate, List *ancestors, + ExplainState *es) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + const char *operation; + const char *foperation; + bool labeltargets; + int j; + List *idxNames = NIL; + ListCell *lst; + + switch (node->operation) + { + case CMD_INSERT: + operation = "Insert"; + foperation = "Foreign Insert"; + break; + case CMD_UPDATE: + operation = "Update"; + foperation = "Foreign Update"; + break; + case CMD_DELETE: + operation = "Delete"; + foperation = "Foreign Delete"; + break; + default: + operation = "???"; + foperation = "Foreign ???"; + break; + } + + /* Should we explicitly label target relations? 
*/ + labeltargets = (mtstate->mt_nrels > 1 || + (mtstate->mt_nrels == 1 && + mtstate->resultRelInfo[0].ri_RangeTableIndex != node->nominalRelation)); + + if (labeltargets) + ExplainOpenGroup("Target Tables", "Target Tables", false, es); + + for (j = 0; j < mtstate->mt_nrels; j++) + { + ResultRelInfo *resultRelInfo = mtstate->resultRelInfo + j; + FdwRoutine *fdwroutine = resultRelInfo->ri_FdwRoutine; + + if (labeltargets) + { + /* Open a group for this target */ + ExplainOpenGroup("Target Table", NULL, true, es); + + /* + * In text mode, decorate each target with operation type, so that + * ExplainTargetRel's output of " on foo" will read nicely. + */ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfoString(es->str, + fdwroutine ? foperation : operation); + } + + /* Identify target */ + ExplainTargetRel((Plan *) node, + resultRelInfo->ri_RangeTableIndex, + es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoChar(es->str, '\n'); + es->indent++; + } + } + + /* Give FDW a chance if needed */ + if (!resultRelInfo->ri_usesFdwDirectModify && + fdwroutine != NULL && + fdwroutine->ExplainForeignModify != NULL) + { + List *fdw_private = (List *) list_nth(node->fdwPrivLists, j); + + fdwroutine->ExplainForeignModify(mtstate, + resultRelInfo, + fdw_private, + j, + es); + } + + if (labeltargets) + { + /* Undo the indentation we added in text format */ + if (es->format == EXPLAIN_FORMAT_TEXT) + es->indent--; + + /* Close the group */ + ExplainCloseGroup("Target Table", NULL, true, es); + } + } + + /* Gather names of ON CONFLICT arbiter indexes */ + foreach(lst, node->arbiterIndexes) + { + char *indexname = get_rel_name(lfirst_oid(lst)); + + idxNames = lappend(idxNames, indexname); + } + + if (node->onConflictAction != ONCONFLICT_NONE) + { + ExplainPropertyText("Conflict Resolution", + node->onConflictAction == ONCONFLICT_NOTHING ? 
+ "NOTHING" : "UPDATE", + es); + + /* + * Don't display arbiter indexes at all when DO NOTHING variant + * implicitly ignores all conflicts + */ + if (idxNames) + ExplainPropertyList("Conflict Arbiter Indexes", idxNames, es); + + /* ON CONFLICT DO UPDATE WHERE qual is specially displayed */ + if (node->onConflictWhere) + { + show_upper_qual((List *) node->onConflictWhere, "Conflict Filter", + &mtstate->ps, ancestors, es); + show_instrumentation_count("Rows Removed by Conflict Filter", 1, &mtstate->ps, es); + } + + /* EXPLAIN ANALYZE display of actual outcome for each tuple proposed */ + if (es->analyze && mtstate->ps.instrument) + { + double total; + double insert_path; + double other_path; + + if (!es->runtime) + InstrEndLoop(outerPlanState(mtstate)->instrument); + + /* count the number of source rows */ + other_path = mtstate->ps.instrument->nfiltered2; + + /* + * Insert occurs after extracting row from subplan and in runtime mode + * we can appear between these two operations - situation when + * total > insert_path + other_path. Therefore we don't know exactly + * whether last row from subplan is inserted. + * We don't print inserted tuples in runtime mode in order to not print + * inconsistent data + */ + if (!es->runtime) + { + total = outerPlanState(mtstate)->instrument->ntuples; + insert_path = total - other_path; + ExplainPropertyFloat("Tuples Inserted", NULL, insert_path, 0, es); + } + ExplainPropertyFloat("Conflicting Tuples", NULL, + other_path, 0, es); + } + } + + if (labeltargets) + ExplainCloseGroup("Target Tables", "Target Tables", false, es); +} + +/* + * Show the hash and merge keys for a Motion node. 
+ */ +static void +show_motion_keys(PlanState *planstate, List *hashExpr, int nkeys, AttrNumber *keycols, + const char *qlabel, List *ancestors, ExplainState *es) +{ + Plan *plan = planstate->plan; + List *context; + char *exprstr; + bool useprefix = list_length(es->rtable) > 1; + int keyno; + List *result = NIL; + + if (!nkeys && !hashExpr) + return; + + /* Set up deparse context */ + context = set_deparse_context_plan(es->deparse_cxt, + plan, + ancestors); + + /* Merge Receive ordering key */ + for (keyno = 0; keyno < nkeys; keyno++) + { + /* find key expression in tlist */ + AttrNumber keyresno = keycols[keyno]; + TargetEntry *target = get_tle_by_resno(plan->targetlist, keyresno); + + /* Deparse the expression, showing any top-level cast */ + if (target) + exprstr = deparse_expression((Node *) target->expr, context, + useprefix, true); + else + { + elog(WARNING, "Gather Motion %s error: no tlist item %d", + qlabel, keyresno); + exprstr = "*BOGUS*"; + } + + result = lappend(result, exprstr); + } + + if (list_length(result) > 0) + ExplainPropertyList(qlabel, result, es); + + /* Hashed repartitioning key */ + if (hashExpr) + { + /* Deparse the expression */ + exprstr = deparse_expression((Node *)hashExpr, context, useprefix, true); + ExplainPropertyText("Hash Key", exprstr, es); + } +} + +/* + * Explain a parallel retrieve cursor, + * indicate the endpoints exist on entry DB, or on some segments, + * or on all segments. 
+ */ +void ExplainParallelRetrieveCursor(ExplainState *es, QueryDesc* queryDesc) +{ + PlannedStmt *plan = queryDesc->plannedstmt; + SliceTable *sliceTable = queryDesc->estate->es_sliceTable; + StringInfoData endpointInfoStr; + enum EndPointExecPosition endPointExecPosition; + + initStringInfo(&endpointInfoStr); + + endPointExecPosition = GetParallelCursorEndpointPosition(plan); + ExplainOpenGroup("Cursor", "Cursor", true, es); + switch(endPointExecPosition) + { + case ENDPOINT_ON_ENTRY_DB: + { + appendStringInfo(&endpointInfoStr, "\"on coordinator\""); + break; + } + case ENDPOINT_ON_SINGLE_QE: + { + appendStringInfo( + &endpointInfoStr, "\"on segment: contentid [%d]\"", + gp_session_id % plan->planTree->flow->numsegments); + break; + } + case ENDPOINT_ON_SOME_QE: + { + ListCell * cell; + bool isFirst = true; + appendStringInfo(&endpointInfoStr, "on segments: contentid ["); + ExecSlice *slice = &sliceTable->slices[0]; + foreach(cell, slice->segments) + { + int contentid = lfirst_int(cell); + appendStringInfo(&endpointInfoStr, (isFirst)?"%d":", %d", contentid); + isFirst = false; + } + appendStringInfo(&endpointInfoStr, "]"); + break; + } + case ENDPOINT_ON_ALL_QE: + { + appendStringInfo(&endpointInfoStr, "on all %d segments", getgpsegmentCount()); + break; + } + default: + { + elog(ERROR, "invalid endpoint position : %d", endPointExecPosition); + break; + } + } + ExplainPropertyText("Endpoint", endpointInfoStr.data, es); + ExplainCloseGroup("Cursor", "Cursor", true, es); +} diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index bfd5f98219c..06a332ece4b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -24,7 +24,6 @@ #include "commands/queue.h" #include "executor/execUtils.h" #include "executor/hashjoin.h" -#include "executor/nodeHash.h" #include "foreign/fdwapi.h" #include "jit/jit.h" #include "nodes/extensible.h" @@ -1009,15 +1008,17 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc) */ if 
(es->analyze && !es->showstatctx->stats_gathered) { + /* when es->analyze is false, the showstatctx is NULL*/ + es->showstatctx->runtime = es->runtime; if (Gp_role != GP_ROLE_EXECUTE && (!es->currentSlice || sliceRunsOnQD(es->currentSlice))) cdbexplain_localExecStats(queryDesc->planstate, es->showstatctx); - /* Fill in the plan's Instrumentation with stats from qExecs. */ - if (estate->dispatcherState && estate->dispatcherState->primaryResults) - cdbexplain_recvExecStats(queryDesc->planstate, - estate->dispatcherState->primaryResults, - LocallyExecutingSliceIndex(estate), - es->showstatctx); + /* Fill in the plan's Instrumentation with stats from qExecs. */ + if (estate->dispatcherState && estate->dispatcherState->primaryResults) + cdbexplain_recvExecStats(queryDesc->planstate, + estate->dispatcherState->primaryResults, + LocallyExecutingSliceIndex(estate), + es->showstatctx); } ExplainPreScanNode(queryDesc->planstate, &rels_used); @@ -1313,8 +1314,11 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es) char *relname; char *conname = NULL; - /* Must clean up instrumentation state */ - InstrEndLoop(instr); + if (!es->runtime) + { + /* Must clean up instrumentation state */ + InstrEndLoop(instr); + } /* * We ignore triggers that were never invoked; they likely aren't @@ -2259,8 +2263,11 @@ ExplainNode(PlanState *planstate, List *ancestors, * instrumentation results the user didn't ask for. But we do the * InstrEndLoop call anyway, if possible, to reduce the number of cases * auto_explain has to contend with. + * + * If flag es->stateinfo is set, i.e. when printing the current execution + * state, this step of cleaning up is missed. */ - if (planstate->instrument) + if (planstate->instrument && !es->runtime) InstrEndLoop(planstate->instrument); /* GPDB_90_MERGE_FIXME: In GPDB, these are printed differently. 
But does that work @@ -2297,7 +2304,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainPropertyFloat("Actual Loops", NULL, nloops, 0, es); } } - else if (es->analyze) + else if (es->analyze && !es->runtime) { if (es->format == EXPLAIN_FORMAT_TEXT) appendStringInfoString(es->str, " (never executed)"); @@ -2313,6 +2320,90 @@ ExplainNode(PlanState *planstate, List *ancestors, } } + /* + * Print the progress of node execution at current loop. + */ + if (planstate->instrument && es->analyze && es->runtime) + { + instr_time starttimespan; + double startup_sec; + double total_sec; + double rows; + double loop_num; + char *status; + + if (!INSTR_TIME_IS_ZERO(planstate->instrument->rt_starttime)) + { + INSTR_TIME_SET_CURRENT(starttimespan); + INSTR_TIME_SUBTRACT(starttimespan, planstate->instrument->rt_starttime); + } + else + INSTR_TIME_SET_ZERO(starttimespan); + startup_sec = 1000.0 * planstate->instrument->rt_firsttuple; + total_sec = 1000.0 * (INSTR_TIME_GET_DOUBLE(planstate->instrument->rt_counter) + + INSTR_TIME_GET_DOUBLE(starttimespan)); + loop_num = planstate->instrument->nloops + 1; + rows = planstate->instrument->rt_tuplecount / loop_num; + + switch (planstate->instrument->nodeStatus) + { + case METRICS_PLAN_NODE_INITIALIZE: + status = &("Initialize"[0]); + break; + case METRICS_PLAN_NODE_EXECUTING: + status = &("Executing"[0]); + break; + case METRICS_PLAN_NODE_FINISHED: + status = &("Finished"[0]); + break; + default: + status = &("Unknown"[0]); + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, " (node status: %s)", status); + } + else + { + ExplainPropertyText("Node status", status, es); + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (es->timing) + { + if (planstate->instrument->running) + appendStringInfo(es->str, + " (actual time=%.3f..%.3f rows=%.0f, loops=%.0f)", + startup_sec, total_sec, rows, loop_num); + else + appendStringInfo(es->str, + " (actual time=%.3f rows=0, loops=%.0f)", + total_sec, 
loop_num); + } + else + appendStringInfo(es->str, + " (actual rows=%.0f, loops=%.0f)", + rows, loop_num); + } + else + { + if (es->timing) + { + if (planstate->instrument->running) + { + ExplainPropertyFloat("Actual Startup Time", NULL, startup_sec, 3, es); + ExplainPropertyFloat("Actual Total Time", NULL, total_sec, 3, es); + } + else + ExplainPropertyFloat("Running Time", NULL, total_sec, 3, es); + } + ExplainPropertyFloat("Actual Rows", NULL, rows, 0, es); + ExplainPropertyFloat("Actual Loops", NULL, loop_num, 0, es); + } + } + /* in text format, first line ends here */ if (es->format == EXPLAIN_FORMAT_TEXT) appendStringInfoChar(es->str, '\n'); @@ -2834,9 +2925,11 @@ ExplainNode(PlanState *planstate, List *ancestors, break; } - /* Show executor statistics */ - if (planstate->instrument && planstate->instrument->need_cdb) + /* Show executor statistics */ + if (planstate->instrument && planstate->instrument->need_cdb && !es->runtime) + { cdbexplain_showExecStats(planstate, es); + } /* * Prepare per-worker JIT instrumentation. As with the overall JIT @@ -2865,7 +2958,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_wal_usage(es, &planstate->instrument->walusage); /* Prepare per-worker buffer/WAL usage */ - if (es->workers_state && (es->buffers || es->wal) && es->verbose) + if (es->workers_state && (es->buffers || es->wal) && es->verbose && !es->runtime) { WorkerInstrumentation *w = planstate->worker_instrument; @@ -3631,7 +3724,7 @@ show_sort_info(SortState *sortstate, ExplainState *es) if (!es->analyze) return; - ns = ((PlanState *) sortstate)->instrument->cdbNodeSummary; + ns = es->runtime? ((PlanState *) sortstate)->instrument->rt_cdbNodeSummary :((PlanState *) sortstate)->instrument->cdbNodeSummary; if (!ns) return; @@ -3964,7 +4057,6 @@ show_hash_info(HashState *hashstate, ExplainState *es) if (hashstate->hinstrument) memcpy(&hinstrument, hashstate->hinstrument, sizeof(HashInstrumentation)); - /* * Merge results from workers. 
In the parallel-oblivious case, the * results from all participants should be identical, except where @@ -5025,15 +5117,27 @@ show_modifytable_info(ModifyTableState *mtstate, List *ancestors, double insert_path; double other_path; - InstrEndLoop(outerPlanState(mtstate)->instrument); + if (!es->runtime) + InstrEndLoop(outerPlanState(mtstate)->instrument); /* count the number of source rows */ - total = outerPlanState(mtstate)->instrument->ntuples; other_path = mtstate->ps.instrument->ntuples2; - insert_path = total - other_path; - ExplainPropertyFloat("Tuples Inserted", NULL, - insert_path, 0, es); + /* + * Insert occurs after extracting row from subplan and in runtime mode + * we can appear between these two operations - situation when + * total > insert_path + other_path. Therefore we don't know exactly + * whether last row from subplan is inserted. + * We don't print inserted tuples in runtime mode in order to not print + * inconsistent data + */ + if (!es->runtime) + { + total = outerPlanState(mtstate)->instrument->ntuples; + insert_path = total - other_path; + ExplainPropertyFloat("Tuples Inserted", NULL, insert_path, 0, es); + } + ExplainPropertyFloat("Conflicting Tuples", NULL, other_path, 0, es); } diff --git a/src/backend/commands/explain_gp.c b/src/backend/commands/explain_gp.c index 27580fbd5fa..ed5b2b1bd48 100644 --- a/src/backend/commands/explain_gp.c +++ b/src/backend/commands/explain_gp.c @@ -28,6 +28,7 @@ #include "cdb/cdbvars.h" /* GpIdentity.segindex */ #include "cdb/cdbendpoint.h" #include "cdb/memquota.h" +#include "executor/nodeHash.h" #include "libpq/pqformat.h" /* pq_beginmessage() etc. 
*/ #include "miscadmin.h" #include "utils/resscheduler.h" @@ -73,6 +74,10 @@ typedef struct CdbExplain_StatInst int enotes; /* Offset to end of node's extra text */ int nworkers_launched; /* Number of workers launched for this node */ WalUsage walusage; /* add WAL usage */ + + /* fields from Instrumentation struct for one cycle of a node */ + double tuplecount; + QueryMetricsStatus nodeStatus; /*CDB: stauts*/ } CdbExplain_StatInst; @@ -126,6 +131,7 @@ typedef struct CdbExplain_NodeSummary { /* Summary over all the node's workers */ CdbExplain_Agg ntuples; + CdbExplain_Agg runtime_tupleAgg; /* tuples of one loop, for runtime stat */ CdbExplain_Agg nloops; CdbExplain_Agg execmemused; CdbExplain_Agg workmemused; @@ -184,6 +190,7 @@ typedef struct CdbExplain_ShowStatCtx int nslice; /* num of slots in slices array */ CdbExplain_SliceSummary *slices; /* -> array[0..nslice-1] of * SliceSummary */ + bool runtime; } CdbExplain_ShowStatCtx; @@ -193,6 +200,7 @@ typedef struct CdbExplain_SendStatCtx StringInfoData *notebuf; StringInfoData buf; CdbExplain_StatHdr hdr; + bool runtime; } CdbExplain_SendStatCtx; @@ -251,6 +259,7 @@ typedef struct CdbExplain_RecvStatCtx /* Rollup of per-node stats over all of the slice's workers and nodes */ double workmemused_max; double workmemwanted_max; + bool runtime; } CdbExplain_RecvStatCtx; @@ -260,6 +269,7 @@ typedef struct CdbExplain_LocalStatCtx CdbExplain_SendStatCtx send; CdbExplain_RecvStatCtx recv; CdbExplain_StatHdr *msgptrs[1]; + bool runtime; } CdbExplain_LocalStatCtx; @@ -283,7 +293,11 @@ static void cdbexplain_depositSliceStats(CdbExplain_StatHdr *hdr, CdbExplain_RecvStatCtx *recvstatctx); static void cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ctx); +static void +cdbexplain_collectStatsFromNode_rt(PlanState *planstate, CdbExplain_SendStatCtx *ctx); static void cdbexplain_depositStatsToNode(PlanState *planstate, + CdbExplain_RecvStatCtx *ctx); +static void 
cdbexplain_depositStatsToNode_rt(PlanState *planstate, CdbExplain_RecvStatCtx *ctx); static int cdbexplain_collectExtraText(PlanState *planstate, StringInfo notebuf); @@ -295,6 +309,8 @@ static void gpexplain_formatSlicesOutput(struct CdbExplain_ShowStatCtx *showstatctx, struct EState *estate, ExplainState *es); +static StringInfo +cdbexplain_getExecStats(QueryDesc *queryDesc, bool runtime); /* * cdbexplain_localExecStats @@ -319,6 +335,7 @@ cdbexplain_localExecStats(struct PlanState *planstate, Assert(planstate && planstate->instrument && showstatctx); memset(&ctx, 0, sizeof(ctx)); + ctx.runtime = showstatctx->runtime; /* Set up send context area. */ ctx.send.notebuf = &showstatctx->extratextbuf; @@ -338,6 +355,7 @@ cdbexplain_localExecStats(struct PlanState *planstate, ctx.recv.dispatchResults = NULL; ctx.recv.extratextbuf = NULL; ctx.recv.showstatctx = showstatctx; + ctx.runtime = showstatctx->runtime; /* * Collect and redeposit statistics from each PlanState node in this @@ -349,8 +367,7 @@ cdbexplain_localExecStats(struct PlanState *planstate, /* Obtain per-slice stats and put them in SliceSummary. */ cdbexplain_collectSliceStats(planstate, &ctx.send.hdr.worker); cdbexplain_depositSliceStats(&ctx.send.hdr, &ctx.recv); -} /* cdbexplain_localExecStats */ - +} /* cdbexplain_localExecStats */ /* * cdbexplain_localStatWalker @@ -359,6 +376,8 @@ static CdbVisitOpt cdbexplain_localStatWalker(PlanState *planstate, void *context) { CdbExplain_LocalStatCtx *ctx = (CdbExplain_LocalStatCtx *) context; + ctx->send.runtime = ctx->runtime; + ctx->recv.runtime = ctx->runtime; /* Collect stats into our temporary StatInst and caller's extratextbuf. 
*/ cdbexplain_collectStatsFromNode(planstate, &ctx->send); @@ -373,6 +392,27 @@ cdbexplain_localStatWalker(PlanState *planstate, void *context) return CdbVisit_Walk; } /* cdbexplain_localStatWalker */ +void +cdbexplain_sendExecStats(QueryDesc *queryDesc) +{ + StringInfo buf = cdbexplain_getExecStats(queryDesc, false); + if (buf == NULL) + return; +#ifdef FAULT_INJECTOR + /* Inject a fault before sending a message to qDisp process */ + SIMPLE_FAULT_INJECTOR("send_exec_stats"); +#endif /* FAULT_INJECTOR */ + /* Send message to qDisp process. */ + buf->cursor = 'Y'; + pq_endmessage(buf); + return; +} + +StringInfo +cdbexplain_getExecStats_runtime(QueryDesc *queryDesc) +{ + return cdbexplain_getExecStats(queryDesc, true); +} /* * cdbexplain_sendExecStats @@ -380,13 +420,14 @@ cdbexplain_localStatWalker(PlanState *planstate, void *context) * On the qDisp, libpq will recognize our special message type ('Y') and * attach the message to the current command's PGresult object. */ -void -cdbexplain_sendExecStats(QueryDesc *queryDesc) +static StringInfo +cdbexplain_getExecStats(QueryDesc *queryDesc, bool runtime) { EState *estate; PlanState *planstate; CdbExplain_SendStatCtx ctx; StringInfoData notebuf; + StringInfo statBuf; /* Header offset (where header begins in the message buffer) */ int hoff; @@ -395,7 +436,7 @@ cdbexplain_sendExecStats(QueryDesc *queryDesc) if (!queryDesc || !queryDesc->estate) - return; + return NULL; /* If executing a root slice (UPD/DEL/INS), start at top of plan tree. */ estate = queryDesc->estate; @@ -412,8 +453,8 @@ cdbexplain_sendExecStats(QueryDesc *queryDesc) planstate = planstate->lefttree; } - if (planstate == NULL) - return; + if (planstate == NULL || planstate->instrument == NULL) + return NULL; /* Start building the message header in our context area. 
*/ memset(&ctx, 0, sizeof(ctx)); @@ -421,12 +462,12 @@ cdbexplain_sendExecStats(QueryDesc *queryDesc) ctx.hdr.segindex = GpIdentity.segindex; ctx.hdr.nInst = 0; + initStringInfo(&ctx.buf); /* Allocate a separate buffer where nodes can append extra message text. */ initStringInfo(¬ebuf); ctx.notebuf = ¬ebuf; - /* Reserve buffer space for the message header (excluding 'inst' array). */ - pq_beginmessage(&ctx.buf, 'Y'); + ctx.runtime = runtime; /* Where the actual StatHdr begins */ hoff = ctx.buf.len; @@ -456,17 +497,12 @@ cdbexplain_sendExecStats(QueryDesc *queryDesc) * header */ memcpy(ctx.buf.data + hoff, (char *) &ctx.hdr, sizeof(ctx.hdr) - sizeof(ctx.hdr.inst)); - -#ifdef FAULT_INJECTOR - /* Inject a fault before sending a message to qDisp process */ - SIMPLE_FAULT_INJECTOR("send_exec_stats"); -#endif /* FAULT_INJECTOR */ - - /* Send message to qDisp process. */ - pq_endmessage(&ctx.buf); + statBuf = makeStringInfo(); + appendBinaryStringInfo(statBuf, ctx.buf.data, ctx.buf.len); + pfree(ctx.buf.data); + return statBuf; } /* cdbexplain_sendExecStats */ - /* * cdbexplain_sendStatWalker */ @@ -490,7 +526,6 @@ cdbexplain_sendStatWalker(PlanState *planstate, void *context) return CdbVisit_Walk; } /* cdbexplain_sendStatWalker */ - /* * cdbexplain_recvExecStats * Called by qDisp to transfer a slice's EXPLAIN ANALYZE statistics @@ -534,6 +569,7 @@ cdbexplain_recvExecStats(struct PlanState *planstate, ctx.extratextbuf = &showstatctx->extratextbuf; ctx.showstatctx = showstatctx; ctx.sliceIndex = sliceIndex; + ctx.runtime = showstatctx->runtime; /* Find the slice's CdbDispatchResult objects. */ dispatchResultBeg = cdbdisp_resultBegin(dispatchResults, sliceIndex); @@ -570,12 +606,14 @@ cdbexplain_recvExecStats(struct PlanState *planstate, /* Find this qExec's last PGresult. If none, skip to next qExec. */ pgresult = cdbdisp_getPGresult(dispatchResult, -1); - if (!pgresult) + if (!pgresult || !pgresult->cdbstats) continue; /* Find our statistics in list of response messages. 
If none, skip. */ for (statcell = pgresult->cdbstats; statcell; statcell = statcell->next) { + if (!statcell || !statcell->data) + continue; if (IsA((Node *) statcell->data, CdbExplain_StatHdr)) break; } @@ -665,7 +703,7 @@ cdbexplain_recvExecStats(struct PlanState *planstate, if (iDispatch == 0) ctx.nStatInst = hdr->nInst; else - ctx.nStatInst = hdr->nInst < ctx.nStatInst ? hdr->nInst : ctx.nStatInst; + ctx.nStatInst = hdr->nInst < ctx.nStatInst ? hdr->nInst : ctx.nStatInst; /* Save lowest and highest segment id for which we have stats. */ if (iDispatch == 0) @@ -744,7 +782,6 @@ cdbexplain_recvStatWalker(PlanState *planstate, void *context) return CdbVisit_Walk; } /* cdbexplain_recvStatWalker */ - /* * cdbexplain_collectSliceStats * Obtain per-slice statistical observations from the current slice @@ -766,7 +803,6 @@ cdbexplain_collectSliceStats(PlanState *planstate, out_worker->vmem_reserved = (double) VmemTracker_GetMaxReservedVmemBytes(); } /* cdbexplain_collectSliceStats */ - /* * cdbexplain_depositSliceStats * Transfer a worker's per-slice stats contribution from StatHdr into the @@ -841,7 +877,6 @@ cdbexplain_depositSliceStats(CdbExplain_StatHdr *hdr, showstatctx->workmemwanted_max = Max(showstatctx->workmemwanted_max, recvstatctx->workmemwanted_max); } /* cdbexplain_depositSliceStats */ - /* * cdbexplain_collectStatsFromNode * @@ -853,6 +888,8 @@ cdbexplain_depositSliceStats(CdbExplain_StatHdr *hdr, static void cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ctx) { + if (ctx->runtime) + return cdbexplain_collectStatsFromNode_rt(planstate, ctx); CdbExplain_StatInst *si = &ctx->hdr.inst[0]; Instrumentation *instr = planstate->instrument; @@ -872,10 +909,8 @@ cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ct /* Make sure there is a '\0' between this node's message and the next. 
*/ if (si->bnotes < si->enotes) appendStringInfoChar(ctx->notebuf, '\0'); - if (planstate->node_context) - si->execmemused = (double) MemoryContextGetPeakSpace(planstate->node_context); - + si->execmemused = (double)MemoryContextGetPeakSpace(planstate->node_context); /* Transfer this node's statistics from Instrumentation into StatInst. */ si->starttime = instr->starttime; si->counter = instr->counter; @@ -894,7 +929,7 @@ cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ct si->workfileCreated = instr->workfileCreated; si->firststart = instr->firststart; si->numPartScanned = instr->numPartScanned; - memcpy(&si->walusage, &instr->walusage, sizeof(WalUsage)); + memcpy(&si->walusage, &instr->walusage, sizeof(WalUsage)); if (IsA(planstate, SortState)) { @@ -935,8 +970,89 @@ cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ct si->nworkers_launched = gathermergestate->nworkers_launched; } #endif -} /* cdbexplain_collectStatsFromNode */ +} /* cdbexplain_collectStatsFromNode */ +/* + * cdbexplain_collectStatsFromNode_rt + * + * Called by sendStatWalker and localStatWalker to obtain a node's statistics + * and transfer them into the temporary StatHdr and StatInst in the SendStatCtx. + * Also obtains the node's extra message text, which it appends to the caller's + * cxt->nodebuf. + */ +static void +cdbexplain_collectStatsFromNode_rt(PlanState *planstate, CdbExplain_SendStatCtx *ctx) +{ + CdbExplain_StatInst *si = &ctx->hdr.inst[0]; + Instrumentation *instr = planstate->instrument; + + Assert(instr); + if (instr == NULL) + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("instrumentation not initialized"))); + /* Initialize the StatInst slot in the temporary StatHdr. */ + memset(si, 0, sizeof(*si)); + si->pstype = planstate->type; + + /* Transfer this node's statistics from Instrumentation into StatInst. 
*/ + si->starttime = instr->starttime; + si->counter = instr->counter; + si->firsttuple = instr->firsttuple; + si->startup = instr->startup; + si->total = instr->total; + si->tuplecount = instr->tuplecount; + si->ntuples = instr->ntuples; + si->ntuples2 = instr->ntuples2; + si->nloops = instr->nloops; + si->nfiltered1 = instr->nfiltered1; + si->nfiltered2 = instr->nfiltered2; + si->workmemused = instr->workmemused; + si->workmemwanted = instr->workmemwanted; + si->workfileCreated = instr->workfileCreated; + si->firststart = instr->firststart; + si->numPartScanned = instr->numPartScanned; + memcpy(&si->walusage, &instr->walusage, sizeof(WalUsage)); + si->nodeStatus = instr->nodeStatus; + + if (IsA(planstate, SortState)) + { + SortState *sortstate = (SortState *) planstate; + + si->sortstats = sortstate->sortstats; + } + if (IsA(planstate, HashState)) + { + HashState *hashstate = (HashState *) planstate; + + if (hashstate->hashtable) + ExecHashAccumInstrumentation(&si->hashstats, hashstate->hashtable); + } + if (IsA(planstate, IncrementalSortState)) + { + IncrementalSortState *incrementalstate = (IncrementalSortState*) planstate; + + memcpy(&si->fullsortGroupInfo, + &incrementalstate->incsort_info.fullsortGroupInfo, + sizeof(IncrementalSortGroupInfo)); + + memcpy(&si->prefixsortGroupInfo, + &incrementalstate->incsort_info.prefixsortGroupInfo, + sizeof(IncrementalSortGroupInfo)); + } +#if 0 + if (IsA(planstate, GatherState)) + { + GatherState *gatherstate = (GatherState *) planstate; + + si->nworkers_launched = gatherstate->nworkers_launched; + } + if (IsA(planstate, GatherMergeState)) + { + GatherMergeState *gathermergestate = (GatherMergeState *) planstate; + + si->nworkers_launched = gathermergestate->nworkers_launched; + } +#endif +} /* * CdbExplain_DepStatAcc @@ -1020,7 +1136,6 @@ cdbexplain_depStatAcc_saveText(CdbExplain_DepStatAcc *acc, } } /* cdbexplain_depStatAcc_saveText */ - /* * cdbexplain_depositStatsToNode * @@ -1033,6 +1148,9 @@ 
cdbexplain_depStatAcc_saveText(CdbExplain_DepStatAcc *acc, static void cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) { + + if (ctx->runtime) + return cdbexplain_depositStatsToNode_rt(planstate, ctx); Instrumentation *instr = planstate->instrument; CdbExplain_StatHdr *rsh; /* The header (which includes StatInst) */ CdbExplain_StatInst *rsi = NULL; /* The current StatInst */ @@ -1053,6 +1171,7 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) CdbExplain_NodeSummary *ns; CdbExplain_DepStatAcc ntuples; CdbExplain_DepStatAcc nloops; + CdbExplain_DepStatAcc runtime_tupleAgg; CdbExplain_DepStatAcc execmemused; CdbExplain_DepStatAcc workmemused; CdbExplain_DepStatAcc workmemwanted; @@ -1080,6 +1199,7 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) /* Initialize per-node accumulators. */ cdbexplain_depStatAcc_init0(&ntuples); cdbexplain_depStatAcc_init0(&nloops); + cdbexplain_depStatAcc_init0(&runtime_tupleAgg); cdbexplain_depStatAcc_init0(&execmemused); cdbexplain_depStatAcc_init0(&workmemused); cdbexplain_depStatAcc_init0(&workmemwanted); @@ -1103,7 +1223,6 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) /* Locate PlanState node's StatInst received from this qExec. */ rsh = ctx->msgptrs[imsgptr]; rsi = &rsh->inst[ctx->iStatInst]; - Assert(rsi->pstype == planstate->type && ns->segindex0 <= rsh->segindex && rsh->segindex < ns->segindex0 + ns->ninst); @@ -1125,6 +1244,7 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) /* Update per-node accumulators. 
*/ cdbexplain_depStatAcc_upd(&ntuples, rsi->ntuples, rsh, rsi, nsi); cdbexplain_depStatAcc_upd(&nloops, rsi->nloops, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&runtime_tupleAgg, rsi->tuplecount, rsh, rsi, nsi); cdbexplain_depStatAcc_upd(&execmemused, rsi->execmemused, rsh, rsi, nsi); cdbexplain_depStatAcc_upd(&workmemused, rsi->workmemused, rsh, rsi, nsi); cdbexplain_depStatAcc_upd(&workmemwanted, rsi->workmemwanted, rsh, rsi, nsi); @@ -1152,6 +1272,7 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) /* Save per-node accumulated stats in NodeSummary. */ ns->ntuples = ntuples.agg; ns->nloops = nloops.agg; + ns->runtime_tupleAgg = runtime_tupleAgg.agg; ns->execmemused = execmemused.agg; ns->workmemused = workmemused.agg; ns->workmemwanted = workmemwanted.agg; @@ -1171,7 +1292,6 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) instr->total = ntuples.max_total; INSTR_TIME_ASSIGN(instr->firststart, ntuples.firststart_of_max_total); - /* * Put winner's stats into qDisp PlanState's Instrument node. */ @@ -1235,7 +1355,6 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) ntuples.agg.vmax > 1.05 * cdbexplain_agg_avg(&ntuples.agg)) cdbexplain_depStatAcc_saveText(&ntuples, ctx->extratextbuf, &saved); } - /* * If this is a HashState, construct a SharedHashInfo with the stats from * all the QEs. In PostgreSQL, SharedHashInfo is used to show stats of all @@ -1295,7 +1414,216 @@ cdbexplain_depositStatsToNode(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) } } /* cdbexplain_depositStatsToNode */ +/* + * cdbexplain_depositStatsToNode_rt + * + * Called by recvStatWalker and localStatWalker to update the given + * PlanState node's Instrument node with statistics received from + * workers or collected locally. Attaches a CdbExplain_NodeSummary + * block to the Instrument node. If top node of slice, per-slice + * statistics are transferred from the StatHdr to the SliceSummary. 
+ */ +static void +cdbexplain_depositStatsToNode_rt(PlanState *planstate, CdbExplain_RecvStatCtx *ctx) +{ + Instrumentation *instr = planstate->instrument; + CdbExplain_StatHdr *rsh; /* The header (which includes StatInst) */ + CdbExplain_StatInst *rsi = NULL; /* The current StatInst */ + /* + * Points to the insts array of node summary (CdbExplain_NodeSummary). + * Used for saving every rsi in the node summary (in addition to saving + * the max/avg). + */ + CdbExplain_StatInst *nsi; + + /* + * ns is the node summary across all QEs of the segworker group. It also + * contains detailed "unsummarized" raw stat for a node across all QEs in + * current segworker group (in the insts array) + */ + CdbExplain_NodeSummary *ns; + CdbExplain_DepStatAcc ntuples; + CdbExplain_DepStatAcc runtime_tupleAgg; + CdbExplain_DepStatAcc execmemused; + CdbExplain_DepStatAcc workmemused; + CdbExplain_DepStatAcc workmemwanted; + CdbExplain_DepStatAcc totalWorkfileCreated; + CdbExplain_DepStatAcc peakmemused; + CdbExplain_DepStatAcc vmem_reserved; + CdbExplain_DepStatAcc totalPartTableScanned; + CdbExplain_DepStatAcc sortSpaceUsed[NUM_SORT_SPACE_TYPE][NUM_SORT_METHOD]; + int imsgptr; + int nInst; + QueryMetricsStatus nodeStatus = METRICS_PLAN_NODE_UNKNOWN; + + if (!(instr && ctx->iStatInst < ctx->nStatInst)) + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("cdbexplain_depositStatsToNode_rt: invalid iStatInst %d, nStatInst %d", + ctx->iStatInst, ctx->nStatInst))); + + /* Allocate NodeSummary block. */ + nInst = ctx->segindexMax + 1 - ctx->segindexMin; + ns = (CdbExplain_NodeSummary *) palloc0(sizeof(*ns) - sizeof(ns->insts) + + nInst * sizeof(ns->insts[0])); + ns->segindex0 = ctx->segindexMin; + ns->ninst = nInst; + + /* Attach our new NodeSummary to the Instrumentation node. */ + instr->rt_cdbNodeSummary = ns; + + /* Initialize per-node accumulators. 
*/ + cdbexplain_depStatAcc_init0(&ntuples); + cdbexplain_depStatAcc_init0(&runtime_tupleAgg); + cdbexplain_depStatAcc_init0(&execmemused); + cdbexplain_depStatAcc_init0(&workmemused); + cdbexplain_depStatAcc_init0(&workmemwanted); + cdbexplain_depStatAcc_init0(&totalWorkfileCreated); + cdbexplain_depStatAcc_init0(&totalPartTableScanned); + for (int i = 0; i < NUM_SORT_METHOD; i++) + { + for (int j = 0; j < NUM_SORT_SPACE_TYPE; j++) + { + cdbexplain_depStatAcc_init0(&sortSpaceUsed[j][i]); + } + } + + /* Initialize per-slice accumulators. */ + cdbexplain_depStatAcc_init0(&peakmemused); + cdbexplain_depStatAcc_init0(&vmem_reserved); + + /* Examine the statistics from each qExec. */ + for (imsgptr = 0; imsgptr < ctx->nmsgptr; imsgptr++) + { + /* Locate PlanState node's StatInst received from this qExec. */ + rsh = ctx->msgptrs[imsgptr]; + rsi = &rsh->inst[ctx->iStatInst]; + + if (!(rsi->pstype == planstate->type && + ns->segindex0 <= rsh->segindex && + rsh->segindex < ns->segindex0 + ns->ninst)) + { + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("cannot depositStatsToNode runtime: pstype %d, planstate type %d", + rsi->pstype, planstate->type))); + } + + /* Locate this qExec's StatInst slot in node's NodeSummary block. */ + nsi = &ns->insts[rsh->segindex - ns->segindex0]; + + /* Copy the StatInst to NodeSummary from dispatch result buffer. */ + *nsi = *rsi; + /* + * Drop qExec's extra text. We rescue it below if qExec is a winner. + * For local qDisp slice, ctx->extratextbuf is NULL, which tells us to + * leave the extra text undisturbed in its existing buffer. + */ + if (ctx->extratextbuf) + nsi->bnotes = nsi->enotes = 0; + + /* Update per-node accumulators. 
*/ + cdbexplain_depStatAcc_upd(&ntuples, rsi->ntuples, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&runtime_tupleAgg, rsi->tuplecount, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&execmemused, rsi->execmemused, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&workmemused, rsi->workmemused, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&workmemwanted, rsi->workmemwanted, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&totalWorkfileCreated, (rsi->workfileCreated ? 1 : 0), rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&totalPartTableScanned, rsi->numPartScanned, rsh, rsi, nsi); + Assert(rsi->sortstats.sortMethod < NUM_SORT_METHOD); + Assert(rsi->sortstats.spaceType < NUM_SORT_SPACE_TYPE); + if (rsi->sortstats.sortMethod != SORT_TYPE_STILL_IN_PROGRESS) + { + cdbexplain_depStatAcc_upd(&sortSpaceUsed[rsi->sortstats.spaceType][rsi->sortstats.sortMethod], + (double) rsi->sortstats.spaceUsed, rsh, rsi, nsi); + } + + /* Update per-slice accumulators. */ + cdbexplain_depStatAcc_upd(&peakmemused, rsh->worker.peakmemused, rsh, rsi, nsi); + cdbexplain_depStatAcc_upd(&vmem_reserved, rsh->worker.vmem_reserved, rsh, rsi, nsi); +#if 0 + if (IsA(planstate, GatherState) || IsA(planstate, GatherMergeState)) + { + rsh->worker.nworkers_launched = nsi->nworkers_launched; + } +#endif + /* Update nodeStatus + * If nodeStatus is METRICS_PLAN_NODE_UNKNOWN, then nodeStatus is rsi->nodeStatus. + * If nodeStatus is METRICS_PLAN_NODE_INITIALIZE and rsi->nodeStatus is METRICS_PLAN_NODE_EXECUTING, + * then nodeStatus is METRICS_PLAN_NODE_EXECUTING. + * If nodeStatus is METRICS_PLAN_NODE_EXECUTING and rsi->nodeStatus is METRICS_PLAN_NODE_FINISHED, + * then nodeStatus is METRICS_PLAN_NODE_EXECUTING. + */ + if (nodeStatus == METRICS_PLAN_NODE_UNKNOWN || rsi->nodeStatus == METRICS_PLAN_NODE_EXECUTING) + nodeStatus = rsi->nodeStatus; + else if (nodeStatus != METRICS_PLAN_NODE_EXECUTING) + { + nodeStatus = rsi->nodeStatus < nodeStatus ? 
rsi->nodeStatus : nodeStatus; + } + } + + /* Save per-node accumulated stats in NodeSummary. */ + ns->ntuples = ntuples.agg; + ns->runtime_tupleAgg = runtime_tupleAgg.agg; + ns->execmemused = execmemused.agg; + ns->workmemused = workmemused.agg; + ns->workmemwanted = workmemwanted.agg; + ns->totalWorkfileCreated = totalWorkfileCreated.agg; + ns->totalPartTableScanned = totalPartTableScanned.agg; + for (int i = 0; i < NUM_SORT_METHOD; i++) + { + for (int j = 0; j < NUM_SORT_SPACE_TYPE; j++) + { + ns->sortSpaceUsed[j][i] = sortSpaceUsed[j][i].agg; + } + } + + /* Roll up summary over all nodes of slice into RecvStatCtx. */ + ctx->workmemused_max = Max(ctx->workmemused_max, workmemused.agg.vmax); + ctx->workmemwanted_max = Max(ctx->workmemwanted_max, workmemwanted.agg.vmax); + + instr->total = ntuples.max_total; + if (nodeStatus != METRICS_PLAN_NODE_UNKNOWN) + instr->nodeStatus = nodeStatus; + if (ctx->runtime) + INSTR_TIME_ASSIGN(instr->firststart, runtime_tupleAgg.firststart_of_max_total); + else + INSTR_TIME_ASSIGN(instr->firststart, ntuples.firststart_of_max_total); + /* Put winner's stats into qDisp PlanState's Instrument node. */ + /* + * GPDB_12_MERGE_FIXME: does it make sense to also print 'nfiltered1' + * 'nfiltered2' from the "winner", i.e. the QE that returned most rows? 
+ * There's this test case in the upstream 'partition_prune' test: + * + * explain (analyze, costs off, summary off, timing off) select * from list_part where a = list_part_fn(1) + a; + * QUERY PLAN + * ------------------------------------------------------ + * Append (actual rows=0 loops=1) + * -> Seq Scan on list_part1 (actual rows=0 loops=1) + * Filter: (a = (list_part_fn(1) + a)) + * Rows Removed by Filter: 1 + * -> Seq Scan on list_part2 (actual rows=0 loops=1) + * Filter: (a = (list_part_fn(1) + a)) + * Rows Removed by Filter: 1 + * -> Seq Scan on list_part3 (actual rows=0 loops=1) + * Filter: (a = (list_part_fn(1) + a)) + * Rows Removed by Filter: 1 + * -> Seq Scan on list_part4 (actual rows=0 loops=1) + * Filter: (a = (list_part_fn(1) + a)) + * Rows Removed by Filter: 1 + * (13 rows) + * + * We don't print those "Rows Removed by Filter" rows in GPDB, because + * they don't come from the "winner" QE. + */ + if (runtime_tupleAgg.agg.vcnt > 0) + { + instr->rt_starttime = runtime_tupleAgg.nsimax->starttime; + instr->rt_counter = runtime_tupleAgg.nsimax->counter; + instr->rt_firsttuple = runtime_tupleAgg.nsimax->firsttuple; + instr->rt_tuplecount = runtime_tupleAgg.nsimax->tuplecount; + /* nloops don't change during runtime */ + instr->nloops = runtime_tupleAgg.nsimax->nloops; + } +} /* * cdbexplain_collectExtraText * Allow a node to supply additional text for its EXPLAIN ANALYZE report. @@ -1560,7 +1888,7 @@ cdbexplain_showExecStats(struct PlanState *planstate, ExplainState *es) { struct CdbExplain_ShowStatCtx *ctx = es->showstatctx; Instrumentation *instr = planstate->instrument; - CdbExplain_NodeSummary *ns = instr->cdbNodeSummary; + CdbExplain_NodeSummary *ns = es->runtime? 
instr->rt_cdbNodeSummary: instr->cdbNodeSummary; instr_time timediff; int i; diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 12561e0c051..7b35c893483 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -116,6 +116,10 @@ InstrStopNodeSync(Instrumentation *instr, uint64 nTuples) /* count the returned tuples */ instr->tuplecount += nTuples; + if (nTuples == 0) + { + instr->nodeStatus = METRICS_PLAN_NODE_FINISHED; + } /* let's update the time only if the timer was requested */ if (instr->need_timer) @@ -145,6 +149,7 @@ InstrStopNodeSync(Instrumentation *instr, uint64 nTuples) instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); /* CDB: save this start time as the first start */ instr->firststart = starttime; + instr->nodeStatus = METRICS_PLAN_NODE_EXECUTING; } } @@ -414,6 +419,7 @@ GpInstrAlloc(const Plan *node, int instrument_options, bool async_mode) if (instr == NULL) instr = InstrAlloc(1, instrument_options, async_mode); + instr->nodeStatus = METRICS_PLAN_NODE_INITIALIZE; return instr; } diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 669b5465d73..727b9424af7 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -6,6 +6,7 @@ * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Postgres Professional * * IDENTIFICATION * src/backend/storage/ipc/procsignal.c @@ -100,6 +101,13 @@ typedef struct ((flags) &= ~(((uint32) 1) << (uint32) (type))) static ProcSignalHeader *ProcSignal = NULL; +#define IsCustomProcSignalReason(reason) \ + ((reason) >= PROCSIG_CUSTOM_1 && (reason) <= PROCSIG_CUSTOM_N) + +static bool CustomSignalPendings[NUM_CUSTOM_PROCSIGNALS]; +static bool CustomSignalProcessing[NUM_CUSTOM_PROCSIGNALS]; +static ProcSignalHandler_type 
CustomInterruptHandlers[NUM_CUSTOM_PROCSIGNALS]; + static ProcSignalSlot *MyProcSignalSlot = NULL; static bool CheckProcSignal(ProcSignalReason reason); @@ -107,6 +115,8 @@ static void CleanupProcSignalState(int status, Datum arg); static void ResetProcSignalBarrierBits(uint32 flags); static bool ProcessBarrierPlaceholder(void); +static void CheckAndSetCustomSignalInterrupts(void); + /* * ProcSignalShmemSize * Compute space needed for procsignal's shared memory @@ -250,6 +260,36 @@ CleanupProcSignalState(int status, Datum arg) slot->pss_pid = 0; } +/* + * RegisterCustomProcSignalHandler + * Assign specific handler of custom process signal with new + * ProcSignalReason key. + * + * This function has to be called in _PG_init function of extensions at the + * stage of loading shared preloaded libraries. Otherwise it throws fatal error. + * + * Return INVALID_PROCSIGNAL if all slots for custom signals are occupied. + */ +ProcSignalReason +RegisterCustomProcSignalHandler(ProcSignalHandler_type handler) +{ + ProcSignalReason reason; + + if (!process_shared_preload_libraries_in_progress) + ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("cannot register custom signal after startup"))); + + /* Iterate through custom signal slots to find a free one */ + for (reason = PROCSIG_CUSTOM_1; reason <= PROCSIG_CUSTOM_N; reason++) + if (!CustomInterruptHandlers[reason - PROCSIG_CUSTOM_1]) + { + CustomInterruptHandlers[reason - PROCSIG_CUSTOM_1] = handler; + return reason; + } + + return INVALID_PROCSIGNAL; +} + /* * SendProcSignal * Send a signal to a Postgres process @@ -707,8 +747,73 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_FAILED_LOGIN)) HandleLoginFailed(); + + CheckAndSetCustomSignalInterrupts(); SetLatch(MyLatch); errno = save_errno; } + +/* + * Handle receipt of an interrupt indicating any of custom process signals. 
+ */ +static void +CheckAndSetCustomSignalInterrupts() +{ + ProcSignalReason reason; + + for (reason = PROCSIG_CUSTOM_1; reason <= PROCSIG_CUSTOM_N; reason++) + { + if (CheckProcSignal(reason)) + { + /* set interrupt flags */ + InterruptPending = true; + CustomSignalPendings[reason - PROCSIG_CUSTOM_1] = true; + } + } + + SetLatch(MyLatch); +} + +/* + * CheckAndHandleCustomSignals + * Check custom signal flags and call handler assigned to that signal + * if it is not NULL + * + * This function is called within CHECK_FOR_INTERRUPTS if interrupt occurred. + */ +void +CheckAndHandleCustomSignals(void) +{ + int i; + + /* + * This is invoked from ProcessInterrupts(), and since some of the + * functions it calls contain CHECK_FOR_INTERRUPTS(), there is a potential + * for recursive calls if more signals are received while this runs, so + * let's block interrupts until done. + */ + HOLD_INTERRUPTS(); + + /* Check on expiring of custom signals and call its handlers if exist */ + for (i = 0; i < NUM_CUSTOM_PROCSIGNALS; i++) + { + if (!CustomSignalProcessing[i] && CustomSignalPendings[i]) + { + ProcSignalHandler_type handler; + + CustomSignalPendings[i] = false; + handler = CustomInterruptHandlers[i]; + if (handler != NULL) + { + CustomSignalProcessing[i] = true; + + handler(); + CustomSignalProcessing[i] = false; + } + } + } + + RESUME_INTERRUPTS(); +} diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index f29c9c2e606..c1e561bd888 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -81,6 +81,7 @@ #include "tcop/pquery.h" #include "tcop/tcopprot.h" #include "tcop/utility.h" +#include "utils/builtins.h" #include "utils/backend_cancel.h" #include "utils/faultinjector.h" #include "utils/lsyscache.h" @@ -4342,12 +4343,12 @@ ProcessInterrupts(const char* filename, int lineno) if (ProcSignalBarrierPending) ProcessProcSignalBarrier(); - if (ParallelMessagePending) HandleParallelMessages(); - if (LogMemoryContextPending) 
ProcessLogMemoryContextInterrupt(); + + CheckAndHandleCustomSignals(); } /* diff --git a/src/include/cdb/cdbexplain.h b/src/include/cdb/cdbexplain.h index 959cb1faec0..bc588b57bd9 100644 --- a/src/include/cdb/cdbexplain.h +++ b/src/include/cdb/cdbexplain.h @@ -16,6 +16,7 @@ #define CDBEXPLAIN_H #include "executor/instrument.h" /* instr_time */ +#include "cdb/cdbdispatchresult.h" struct CdbDispatchResults; /* #include "cdb/cdbdispatchresult.h" */ struct PlanState; /* #include "nodes/execnodes.h" */ @@ -91,6 +92,13 @@ cdbexplain_localExecStats(struct PlanState *planstate, void cdbexplain_sendExecStats(struct QueryDesc *queryDesc); +/* + * cdbexplain_getExecStats + * Called by cbdb_mpp_query_state to send EXPLAIN ANALYZE + * statistics when the query is still running. + */ +StringInfo cdbexplain_getExecStats_runtime(QueryDesc *queryDesc); + /* * cdbexplain_recvExecStats * Called by qDisp to transfer a slice's EXPLAIN ANALYZE statistics @@ -107,7 +115,7 @@ cdbexplain_recvExecStats(struct PlanState *planstate, int sliceIndex, struct CdbExplain_ShowStatCtx *showstatctx); -/* +/* * cdbexplain_showExecStatsBegin * Called by qDisp process to create a CdbExplain_ShowStatCtx structure * in which to accumulate overall statistics for a query. 
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 77cb96f0cab..7b80ee7d964 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -70,6 +70,8 @@ typedef struct ExplainState bool hide_workers; /* set if we find an invisible Gather */ /* state related to the current plan node */ ExplainWorkersState *workers_state; /* needed if parallel plan */ + bool runtime; /* print intermediate state of query execution, + not after completion */ } ExplainState; /* Hook for plugins to get control in ExplainOneQuery() */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 4536df3b237..b2ea23bd433 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -17,6 +17,7 @@ #include "nodes/plannodes.h" #include "portability/instr_time.h" +#include "utils/metrics_utils.h" #include "utils/resowner.h" #include "storage/s_lock.h" @@ -110,6 +111,13 @@ typedef struct Instrumentation const char *sortSpaceType; /* CDB: Sort space type (Memory / Disk) */ long sortSpaceUsed; /* CDB: Memory / Disk used by sort(KBytes) */ struct CdbExplain_NodeSummary *cdbNodeSummary; /* stats from all qExecs */ + /* runtime stats across all qEs */ + instr_time rt_starttime; /* Start time of current iteration of node */ + instr_time rt_counter; + double rt_firsttuple; + uint64 rt_tuplecount; /* The max tuples aggregated across all qes*/ + QueryMetricsStatus nodeStatus; /*CDB: node stauts*/ + struct CdbExplain_NodeSummary *rt_cdbNodeSummary; /* stats from all qExecs */ } Instrumentation; typedef struct WorkerInstrumentation diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 0815460c72f..4e10f2d2bdd 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -17,6 +17,8 @@ #include "storage/backendid.h" +#define NUM_CUSTOM_PROCSIGNALS 64 + /* * Reasons for signaling a Postgres child process (a backend or an auxiliary * process, 
like checkpointer). We can cope with concurrent signals for different @@ -29,6 +31,8 @@ */ typedef enum { + INVALID_PROCSIGNAL = -1, /* Must be first */ + PROCSIG_CATCHUP_INTERRUPT, /* sinval catchup interrupt */ PROCSIG_NOTIFY_INTERRUPT, /* listen/notify interrupt */ PROCSIG_PARALLEL_MESSAGE, /* message from cooperating parallel backend */ @@ -49,6 +53,14 @@ typedef enum PROCSIG_FAILED_LOGIN, /* failed login */ + PROCSIG_CUSTOM_1, + /* + * PROCSIG_CUSTOM_2, + * ..., + * PROCSIG_CUSTOM_N-1, + */ + PROCSIG_CUSTOM_N = PROCSIG_CUSTOM_1 + NUM_CUSTOM_PROCSIGNALS - 1, + NUM_PROCSIGNALS /* Must be last! */ } ProcSignalReason; @@ -61,6 +73,8 @@ typedef enum */ PROCSIGNAL_BARRIER_PLACEHOLDER = 0 } ProcSignalBarrierType; +/* Handler of custom process signal */ +typedef void (*ProcSignalHandler_type) (void); /* * prototypes for functions in procsignal.c @@ -69,12 +83,15 @@ extern Size ProcSignalShmemSize(void); extern void ProcSignalShmemInit(void); extern void ProcSignalInit(int pss_idx); +extern ProcSignalReason + RegisterCustomProcSignalHandler(ProcSignalHandler_type handler); extern int SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId); extern uint64 EmitProcSignalBarrier(ProcSignalBarrierType type); extern void WaitForProcSignalBarrier(uint64 generation); extern void ProcessProcSignalBarrier(void); +extern void CheckAndHandleCustomSignals(void); extern void procsignal_sigusr1_handler(SIGNAL_ARGS); diff --git a/src/include/utils/metrics_utils.h b/src/include/utils/metrics_utils.h index c85fab849e1..356f482561c 100644 --- a/src/include/utils/metrics_utils.h +++ b/src/include/utils/metrics_utils.h @@ -22,6 +22,7 @@ typedef enum { + METRICS_PLAN_NODE_UNKNOWN = 0, METRICS_PLAN_NODE_INITIALIZE = 100, METRICS_PLAN_NODE_EXECUTING, METRICS_PLAN_NODE_FINISHED, diff --git a/src/test/regress/query_info_hook_test/query_info_hook_test.c b/src/test/regress/query_info_hook_test/query_info_hook_test.c index 15facca08b4..df506116f48 100644 --- 
a/src/test/regress/query_info_hook_test/query_info_hook_test.c +++ b/src/test/regress/query_info_hook_test/query_info_hook_test.c @@ -40,6 +40,8 @@ test_hook(QueryMetricsStatus status, void* args) switch (status) { + case METRICS_PLAN_NODE_UNKNOWN: + break; case METRICS_PLAN_NODE_INITIALIZE: switch (((QueryDesc *)args)->plannedstmt->metricsQueryType) { From 38cb4c05161b4b3c77e0696b54389b7791f40b59 Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Fri, 20 Sep 2024 20:55:27 +0800 Subject: [PATCH 16/40] Perfmon: fix metrics error --- contrib/perfmon/src/gpmmon/gpmon_agg.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 59d0a6402f8..59b0ec38b30 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -70,6 +70,7 @@ typedef struct dbmetrics_t { extern int min_query_time; extern mmon_options_t opt; extern apr_queue_t* message_queue; +int32 tmid = -1; extern void incremement_tail_bytes(apr_uint64_t bytes); static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, @@ -310,11 +311,13 @@ static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { + gpmon_qlogkey_t key = qlog->key; + key.tmid = tmid; qdnode_t* node; - node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); + node = apr_hash_get(agg->qtab, &key, sizeof(key)); if (!node) { - gpmon_warning(FLINE, "put query metrics can not find qdnode from qtab, queryID :%d-%d-%d", qlog->key.tmid,qlog->key.ssid,qlog->key.ccnt); + gpmon_warning(FLINE, "put query metrics can not find qdnode from qtab, queryID :%d-%d-%d", tmid, qlog->key.ssid,qlog->key.ccnt); } if (node) { @@ -338,6 +341,10 @@ static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* 
qlog, apr_int64_t generation) { + if (tmid == -1) + { + tmid = qlog->key.tmid; + } if (qlog->dbid == gpperfmon_dbid) { TR2(("agg_put_qlog:(%d.%d.%d) ignore gpperfmon sql\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt)); return 0; From 329938872553cb722ea91d4dcd81d543a6d382f3 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Wed, 25 Sep 2024 16:53:03 +0800 Subject: [PATCH 17/40] Small fixes for perfmon - Get query plan in Executor_start - Add version check - Remove create alter related tables. --- contrib/perfmon/src/gpmmon/gpmmon.c | 2 +- contrib/perfmon/src/gpmmon/gpmon_agg.c | 16 +++++----- contrib/perfmon/src/gpmmon/gpmondb.c | 12 ++++---- contrib/perfmon/src/gpmon/gpmon.c | 23 +++++++-------- contrib/perfmon/src/gpmon/pg_query_state.c | 34 ++++++++++++---------- 5 files changed, 44 insertions(+), 43 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index 790f42539db..3de9897451e 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -1506,7 +1506,7 @@ int perfmon_main(Datum arg) } } - create_log_alert_table(); + //create_log_alert_table(); gpmmon_main(); cleanup(); diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 59b0ec38b30..994a68ac0b7 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -311,14 +311,16 @@ static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { - gpmon_qlogkey_t key = qlog->key; - key.tmid = tmid; - qdnode_t* node; + gpmon_qlogkey_t key = qlog->key; + key.tmid = tmid; + qdnode_t *node; node = apr_hash_get(agg->qtab, &key, sizeof(key)); - if (!node) { - gpmon_warning(FLINE, "put query metrics can not find qdnode from qtab, queryID :%d-%d-%d", tmid, qlog->key.ssid,qlog->key.ccnt); - } + if (!node) + { + TR2(("put query metrics can not 
find qdnode from qtab, queryID :%d-%d-%d \n", + tmid, qlog->key.ssid, qlog->key.ccnt)); + } if (node) { // here update the stats for the query @@ -327,7 +329,7 @@ static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, node->qlog.p_metrics.fd_cnt += qlog->p_metrics.fd_cnt; if (qlog->p_metrics.mem.size > node->qlog.p_metrics.mem.size) { - node->qlog.p_metrics.mem.size = qlog->p_metrics.mem.size; + node->qlog.p_metrics.mem.size = qlog->p_metrics.mem.size; }; node->last_updated_generation = generation; node->num_metrics_packets++; diff --git a/contrib/perfmon/src/gpmmon/gpmondb.c b/contrib/perfmon/src/gpmmon/gpmondb.c index 199ebb13a1e..3c34c181404 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.c +++ b/contrib/perfmon/src/gpmmon/gpmondb.c @@ -1330,13 +1330,13 @@ apr_status_t gpdb_check_partitions(mmon_options_t *opt) result = call_for_each_table_with_opt(check_partition, NULL, conn, opt); // make sure to run check_partition even if we just got a failure from the previous call - apr_status_t temp_result; - temp_result = check_partition("log_alert", NULL, conn, opt); + //apr_status_t temp_result; + //temp_result = check_partition("log_alert", NULL, conn, opt); - // use the first error that occurred, if any - if (result == APR_SUCCESS) { - result = temp_result; - } + //// use the first error that occurred, if any + //if (result == APR_SUCCESS) { + // result = temp_result; + //} } // close connection diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index e4f677162cb..04e846226e1 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -503,19 +503,16 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) gpmon_qlog_query_error(gpmonPacket); break; case METRICS_PLAN_NODE_INITIALIZE: - if (!enable_qs_runtime()) - { - query_text = get_query_text(qd); - plan = get_plan(qd); - gpmon_qlog_query_text(gpmonPacket, - query_text, - plan, - application_name, - NULL, - 
NULL, - GPMON_QLOG_STATUS_START); - pfree(plan); - } + query_text = get_query_text(qd); + plan = get_plan(qd); + gpmon_qlog_query_text(gpmonPacket, + query_text, + plan, + application_name, + NULL, + NULL, + GPMON_QLOG_STATUS_START); + pfree(plan); break; default: break; diff --git a/contrib/perfmon/src/gpmon/pg_query_state.c b/contrib/perfmon/src/gpmon/pg_query_state.c index d75c5c05fa3..569e7596c3a 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.c +++ b/contrib/perfmon/src/gpmon/pg_query_state.c @@ -58,7 +58,7 @@ volatile pg_atomic_uint32 *pg_qs_on; * the finished query node. We cached the query * state info at end the query. And reset it * when next query starts. - * + * * On QD, it is used to cache the whole query * state info. And gpmon_query_info_collect_hook * will send it to gpsmon. Also reset it when @@ -601,7 +601,7 @@ pg_query_state(PG_FUNCTION_ARGS) CdbPgResults cdb_pgresults = CollectQEQueryState(backendInfo); AttachPeer(); msg = GetRemoteBackendQueryStates(cdb_pgresults, - proc, + proc, verbose, costs, timing, @@ -1080,8 +1080,8 @@ GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, mqh = shm_mq_attach(mq, NULL, NULL); sig_result = SendProcSignal(proc->pid, - QueryStatePollReason, - proc->backendId); + QueryStatePollReason, + proc->backendId); if (sig_result == -1) { goto signal_error; @@ -1115,7 +1115,7 @@ GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, errmsg("error in message queue data transmitting"))); } -static shm_mq_msg* +static shm_mq_msg* receive_final_query_state(void) { shm_mq_handle *mqh; @@ -1164,7 +1164,7 @@ cbdb_mpp_query_state(PG_FUNCTION_ARGS) Size len; if (proc == NULL) continue; - /* + /* * Wait for shm_mq detached as the mq will be reused here, * we need to wait for the mqh->sender to detached first, * then reset the mq, otherwiase it will panic @@ -1178,7 +1178,7 @@ cbdb_mpp_query_state(PG_FUNCTION_ARGS) create_shm_mq(proc, MyProc); mqh = shm_mq_attach(mq, NULL, NULL); /* - * send signal `QueryStatePollReason` 
to all processes + * send signal `QueryStatePollReason` to all processes */ sig_result = SendProcSignal(proc->pid, QueryStatePollReason, @@ -1201,7 +1201,7 @@ cbdb_mpp_query_state(PG_FUNCTION_ARGS) elog(DEBUG1, "invalid msg from %d", proc->pid); goto mq_error; } - /* + /* * the query of this slice maybe closed or no query running on that backend * such as create table as, some backends insert data to the table instead * of running any plan nodes. @@ -1262,7 +1262,7 @@ get_query_backend_info(ArrayType *array) for (int i = 0; i < len; i++) { - HeapTupleHeader td = DatumGetHeapTupleHeader(data[i]); + HeapTupleHeader td = DatumGetHeapTupleHeader(data[i]); TupleDesc tupDesc; HeapTupleData tmptup; int32 pid; @@ -1310,11 +1310,9 @@ void create_shm_mq(PGPROC *sender, PGPROC *receiver) { memset(mq, 0, QUEUE_SIZE); - mq = shm_mq_create(mq, QUEUE_SIZE); - shm_mq_set_sender(mq, sender); - shm_mq_set_receiver(mq, receiver); /* this function notifies the - counterpart to come into data - transfer */ + mq = shm_mq_create(mq, QUEUE_SIZE); + shm_mq_set_sender(mq, sender); + shm_mq_set_receiver(mq, receiver); } static bool @@ -1399,7 +1397,7 @@ check_and_init_peer(LOCKTAG *tag, PGPROC *proc, int n_peers) counterpart_user_id = GetRemoteBackendUserId(proc); if (counterpart_user_id == InvalidOid) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("query is busy, no response"))); + errmsg("query is busy, no response"))); if (!(superuser() || GetUserId() == counterpart_user_id)) { UnlockShmem(tag); @@ -1610,7 +1608,11 @@ enable_qs_runtime(void) { if (!pg_qs_enable) return false; - return pg_atomic_read_u32(pg_qs_on); + if (!pg_atomic_read_u32(pg_qs_on)) + return false; + if (strcmp(GP_VERSION, "1.6.0") <= 0) + return false; + return true; } /* check and count the cbd_pgresults */ From 304de257398fe302ab74fad6d4be4ceddeecf972 Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Thu, 26 Sep 2024 20:11:58 +0800 Subject: [PATCH 18/40] Perfmon: fix memory metrics error Improper use of 
pointers leads to incorrect collection of memory metrics. --- contrib/perfmon/src/gpmmon/gpmon_agg.c | 4 +- contrib/perfmon/src/gpsmon/gpsmon.c | 145 +++++++++++++------------ 2 files changed, 76 insertions(+), 73 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 994a68ac0b7..ca9c197d4e8 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -333,8 +333,8 @@ static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, }; node->last_updated_generation = generation; node->num_metrics_packets++; - TR2(("Query Metrics: (host %s ssid %d ccnt %d) (cpuelapsed %d cpupct %f) / %d\n", - qlog->user, qlog->key.ssid, qlog->key.ccnt, (int) node->qlog.cpu_elapsed, node->qlog.p_metrics.cpu_pct, + TR2(("Query Metrics: (host %s ssid %d ccnt %d) (cpuelapsed %d cpupct %f memsize %lu) / %d\n", + qlog->user, qlog->key.ssid, qlog->key.ccnt, (int) node->qlog.cpu_elapsed, node->qlog.p_metrics.cpu_pct, node->qlog.p_metrics.mem.size, node->num_metrics_packets)); } return 0; diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index 211656afbfd..b9899955b91 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -750,92 +750,95 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) for (hi = apr_hash_first(0, pidtab); hi; hi = apr_hash_next(hi)) { void* vptr; - pidrec_t* lookup; - pidrec_t* pidrec; + pidrec_t* queryMetric; + pidrec_t *pidrec; apr_hash_this(hi, 0, 0, &vptr); pidrec = vptr; - if (!pidrec) - { - continue; - } + if (!pidrec) + { + continue; + } TR2(("%s: %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct, pidrec->p_metrics.mem.size)); // table is keyed on query key - lookup = apr_hash_get(query_cpu_table, 
&pidrec->query_key, sizeof(pidrec->query_key)); + queryMetric = apr_hash_get(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key)); - if (lookup) + if (queryMetric) { // found other pids with same query key so add the metrics to that - lookup->cpu_elapsed += pidrec->cpu_elapsed; - lookup->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; - lookup->p_metrics.fd_cnt += lookup->p_metrics.fd_cnt; - lookup->p_metrics.mem.resident += lookup->p_metrics.mem.resident; - lookup->p_metrics.mem.size += lookup->p_metrics.mem.size; - lookup->p_metrics.mem.share += lookup->p_metrics.mem.share; + queryMetric->cpu_elapsed += pidrec->cpu_elapsed; + queryMetric->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; + queryMetric->p_metrics.fd_cnt += pidrec->p_metrics.fd_cnt; + queryMetric->p_metrics.mem.resident += pidrec->p_metrics.mem.resident; + queryMetric->p_metrics.mem.size += pidrec->p_metrics.mem.size; + queryMetric->p_metrics.mem.share += pidrec->p_metrics.mem.share; + TR2(("%s: increase %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", + FLINE, queryMetric->query_key.tmid, queryMetric->query_key.ssid, queryMetric->query_key.ccnt, queryMetric->pid, + queryMetric->cpu_elapsed, queryMetric->p_metrics.cpu_pct, queryMetric->p_metrics.mem.size)); } else { // insert existing pid record into table keyed by query key - apr_hash_set(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key), pidrec); + queryMetric = apr_palloc(oldpool, sizeof(pidrec_t)); + memcpy(queryMetric, pidrec, sizeof(pidrec_t)); + apr_hash_set(query_cpu_table, &queryMetric->query_key, sizeof(gpmon_qlogkey_t), queryMetric); } - - // add to queryseg hash table - gp_smon_to_mmon_packet_t* rec; - rec = apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); - if (rec) - { - rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; - } - else - { - rec = apr_palloc(apr_hash_pool_get(querysegtab),sizeof(gp_smon_to_mmon_packet_t)); - CHECKMEM(rec); - 
gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); - rec->u.queryseg.key = pidrec->qseg_key; - rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; - apr_hash_set(querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); - } + // add to queryseg hash table + gp_smon_to_mmon_packet_t *rec; + rec = apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); + if (rec) + { + rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; + } + else + { + rec = apr_palloc(apr_hash_pool_get(querysegtab), sizeof(gp_smon_to_mmon_packet_t)); + CHECKMEM(rec); + gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); + rec->u.queryseg.key = pidrec->qseg_key; + rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; + apr_hash_set(querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); + } //add to new pidtab if process is exist int status = sigar_proc_state_get(gx.sigar,pidrec->pid, &state); - if (status == SIGAR_OK) - { - apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); - pidrec_t* newpidrec = apr_palloc(pool, sizeof(*pidrec)); - memcpy(newpidrec, pidrec, sizeof(*pidrec)); - apr_hash_set(gx.pidtab, &newpidrec->pid, sizeof(newpidrec->pid), newpidrec); - TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); - continue; - } - TR2(("%s: %d-%d-%d pid %d pid status %d not add to new pidtab \n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, status)); + if (status == SIGAR_OK) + { + apr_pool_t *pool = apr_hash_pool_get(gx.pidtab); + pidrec_t *newpidrec = apr_palloc(pool, sizeof(*pidrec)); + memcpy(newpidrec, pidrec, sizeof(*pidrec)); + apr_hash_set(gx.pidtab, &newpidrec->pid, sizeof(newpidrec->pid), newpidrec); + TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); + continue; + } + TR2(("%s: %d-%d-%d pid %d pid 
status %d not add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, status)); } + /* + * QUERYSEG packets must be sent after QLOG packets so that gpmmon can + * correctly populate its query_seginfo_hash. + */ + for (hi = apr_hash_first(0, querysegtab); hi; hi = apr_hash_next(hi)) + { + void *vptr; + apr_hash_this(hi, 0, 0, &vptr); + ppkt = vptr; + if (ppkt->header.pkttype != GPMON_PKTTYPE_QUERYSEG) + continue; - /* - * QUERYSEG packets must be sent after QLOG packets so that gpmmon can - * correctly populate its query_seginfo_hash. - */ - for (hi = apr_hash_first(0, querysegtab); hi; hi = apr_hash_next(hi)) - { - void* vptr; - apr_hash_this(hi, 0, 0, &vptr); - ppkt = vptr; - if (ppkt->header.pkttype != GPMON_PKTTYPE_QUERYSEG) - continue; - - TR2(("%s: sending magic %x, pkttype %d, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->header.pkttype, - ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt)); - send_smon_to_mon_pkt(sock, ppkt); - count++; - } + TR2(("%s: sending magic %x, pkttype %d, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->header.pkttype, + ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt)); + send_smon_to_mon_pkt(sock, ppkt); + count++; + } // reset packet to 0 ppkt = &localPacketObject; @@ -1534,18 +1537,18 @@ void gx_main(int port, apr_int64_t signature) /* refresh pid metrics */ for (hi = apr_hash_first(0, gx.pidtab); hi; hi = apr_hash_next(hi)) { - void* vptr; - pidrec_t* rec; - apr_hash_this(hi, 0, 0, &vptr); - rec = vptr; + void *vptr; + pidrec_t *rec; + apr_hash_this(hi, 0, 0, &vptr); + rec = vptr; if (rec) - { - TR2(("%s: %d-%d-%d pid %d refresh process metrics \n ", - FLINE, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt, rec->pid)); + { + TR2(("%s: %d-%d-%d pid %d refresh process metrics \n ", + FLINE, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt, rec->pid)); get_pid_metrics(rec->hash_key, - rec->query_key.tmid, - 
rec->query_key.ssid, - rec->query_key.ccnt); + rec->query_key.tmid, + rec->query_key.ssid, + rec->query_key.ccnt); } } From 21589eda9c2b8b11dc48efd7a1ec884c8a40d12f Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Mon, 11 Nov 2024 19:03:26 +0800 Subject: [PATCH 19/40] perfmon: Add test files for pg_query_state and fix some issues * Add queryId in plannedstmt on master, and it will be dispatched to segments. It is used to check if the query of stats from segments are same with the query on the master. Sometimes the stats are the last query's stat running on in the same session or the child query's stat. User the queryId to filter them * Make the pg_query_state to suport prepared statment * Add pg_qs test --- contrib/perfmon/Makefile | 2 +- contrib/perfmon/expected/pg_qs.out | 25 + contrib/perfmon/sql/pg_qs.sql | 4 + contrib/perfmon/src/gpmon/gpmon.c | 9 +- contrib/perfmon/src/gpmon/pg_query_state.c | 375 +++++++------- contrib/perfmon/src/gpmon/pg_query_state.h | 15 +- contrib/perfmon/src/gpmon/signal_handler.c | 178 +++---- contrib/perfmon/src/gpmon/tests/common.py | 169 +++++++ .../src/gpmon/tests/pg_qs_test_runner.py | 152 ++++++ contrib/perfmon/src/gpmon/tests/test_cases.py | 473 ++++++++++++++++++ src/backend/commands/explain_gp.c | 16 +- src/backend/tcop/pquery.c | 2 +- 12 files changed, 1127 insertions(+), 293 deletions(-) create mode 100644 contrib/perfmon/expected/pg_qs.out create mode 100644 contrib/perfmon/sql/pg_qs.sql create mode 100644 contrib/perfmon/src/gpmon/tests/common.py create mode 100644 contrib/perfmon/src/gpmon/tests/pg_qs_test_runner.py create mode 100644 contrib/perfmon/src/gpmon/tests/test_cases.py diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index 1226038a676..b64484917e1 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,7 +1,7 @@ NAME = perfmon EXTVERSION = 1.1.0 -REGRESS = pre_run_check guc_config query extension_test post_run +REGRESS = pre_run_check guc_config query extension_test pg_qs 
post_run ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/perfmon/expected/pg_qs.out b/contrib/perfmon/expected/pg_qs.out new file mode 100644 index 00000000000..7ae22632d5e --- /dev/null +++ b/contrib/perfmon/expected/pg_qs.out @@ -0,0 +1,25 @@ +\!python3 src/gpmon/tests/pg_qs_test_runner.py --port $PGPORT --database gpperfmon --user gpadmin +setting up... +done! +test when two backends try to extract state of each other... +ok! +test statistics of simple query... +ok! +test when two backends compete with each other to extract state from third running backend... +ok! +test statistics under calling function... +ok! +test plan costs... +ok! +test timing statistics... +ok! +test statistics on conflicting tuples under INSERT ON CONFLICT query... +ok! +test skip explain analyze query... +ok! +test statistics of init plan... +ok! +test qe can cache query state when finished... +ok! +tearing down... +done! diff --git a/contrib/perfmon/sql/pg_qs.sql b/contrib/perfmon/sql/pg_qs.sql new file mode 100644 index 00000000000..72b959b0883 --- /dev/null +++ b/contrib/perfmon/sql/pg_qs.sql @@ -0,0 +1,4 @@ +-- start_ignore +\! sudo /bin/bash -c 'source /usr/local/cloudberry-db-devel/greenplum_path.sh;pip3 install psycopg2;pip3 install progressbar' +-- end_ignore +\!python3 src/gpmon/tests/pg_qs_test_runner.py --port $PGPORT --database gpperfmon --user gpadmin diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 04e846226e1..5fb1c309eb5 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -478,8 +478,13 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) gpmon_qlog_query_submit(gpmonPacket); break; case METRICS_QUERY_DONE: - if (enable_qs_runtime() && CachedQueryStateInfo != NULL && - CachedQueryStateInfo->gp_command_count == gp_command_count) + /* + * plannedstmt in queryDesc may have been cleaned , + * so we cannot check queryId here. 
+ * Only check gp_command_count + */ + if (enable_qs_runtime() && CachedQueryStateInfo != NULL + && get_command_count(CachedQueryStateInfo) == gp_command_count) { query_text = get_query_text(qd); plan = (char *)CachedQueryStateInfo->data; diff --git a/contrib/perfmon/src/gpmon/pg_query_state.c b/contrib/perfmon/src/gpmon/pg_query_state.c index 569e7596c3a..cbe1129b365 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.c +++ b/contrib/perfmon/src/gpmon/pg_query_state.c @@ -10,13 +10,23 @@ #include "pg_query_state.h" #include "access/htup_details.h" +#include "access/xact.h" #include "catalog/pg_type.h" -#include "funcapi.h" +#include "cdb/cdbdispatchresult.h" +#include "cdb/cdbdisp_query.h" +#include "cdb/cdbexplain.h" +#include "cdb/cdbvars.h" #include "executor/execParallel.h" #include "executor/executor.h" +#include "fmgr.h" +#include "funcapi.h" +#include "libpq-fe.h" +#include "libpq-int.h" +#include "libpq/pqformat.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "nodes/print.h" +#include "parser/analyze.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include "storage/ipc.h" @@ -27,23 +37,16 @@ #include "storage/shm_toc.h" #include "utils/guc.h" #include "utils/timestamp.h" -#include "cdb/cdbdispatchresult.h" -#include "cdb/cdbdisp_query.h" -#include "cdb/cdbexplain.h" -#include "cdb/cdbvars.h" -#include "libpq-fe.h" -#include "libpq/pqformat.h" -#include "fmgr.h" #include "utils/lsyscache.h" +#include "utils/portal.h" #include "utils/typcache.h" -#include "libpq-int.h" #define TEXT_CSTR_CMP(text, cstr) \ (memcmp(VARDATA(text), (cstr), VARSIZE(text) - VARHDRSZ)) #define HEADER_LEN sizeof(int) * 2 /* GUC variables */ -bool pg_qs_enable = true; +bool pg_qs_enable = false; bool pg_qs_timing = false; bool pg_qs_buffers = false; StringInfo queryStateData = NULL; @@ -65,7 +68,8 @@ volatile pg_atomic_uint32 *pg_qs_on; * next query starts. 
*/ query_state_info *CachedQueryStateInfo = NULL; -MemoryContext queryStateCtx = NULL; +static MemoryContext queryStateCtx = NULL; +static int qs_query_count = 0; /* Saved hook values in case of unload */ static ExecutorStart_hook_type prev_ExecutorStart = NULL; @@ -81,11 +85,11 @@ static void qs_ExecutorFinish(QueryDesc *queryDesc); static void qs_ExecutorEnd(QueryDesc *queryDesc); static void clear_queryStateInfo(void); static void -set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, int gp_command_count, int queryId); +set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, uint64 queryId); static shm_mq_result receive_msg_by_parts(shm_mq_handle *mqh, Size *total, void **datap, int64 timeout, int *rc, bool nowait); /* functions added by cbdb */ -static List *GetRemoteBackendInfo(PGPROC *proc); +static PG_QS_RequestResult GetRemoteBackendInfo(PGPROC *proc, List **result); static CdbPgResults CollectQEQueryState(List *backendInfo); static List *get_query_backend_info(ArrayType *array); static shm_mq_msg *GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, @@ -96,8 +100,8 @@ static shm_mq_msg *GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, bool buffers, bool triggers, ExplainFormat format); -static void qs_print_plan(qs_query *query); -static bool filter_query_common(QueryDesc *queryDesc); +static void qs_print_plan(QueryDesc *queryDesc); +static bool filter_query(QueryDesc *queryDesc); /* functions added by cbdb */ /* important to record the info of the peer */ @@ -105,7 +109,7 @@ static void check_and_init_peer(LOCKTAG *tag, PGPROC *proc, int n_peers); static shm_mq_msg *receive_final_query_state(void); static bool wait_for_mq_ready(shm_mq *mq); static List *get_cdbStateCells(CdbPgResults cdb_pgresults); -static qs_query *push_query(QueryDesc *queryDesc); +static void push_query(QueryDesc *queryDesc); static void pop_query(void); /* Global variables */ @@ -304,7 +308,6 @@ init_pg_query_state(void) ExecutorFinish_hook = 
qs_ExecutorFinish; prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = pg_qs_shmem_startup; - prev_ExecutorEnd = ExecutorEnd_hook; ExecutorEnd_hook = qs_ExecutorEnd; } @@ -331,8 +334,12 @@ qs_ExecutorStart(QueryDesc *queryDesc, int eflags) instr_time starttime; /* Enable per-node instrumentation */ if (enable_qs_runtime() && ((eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0) && - Gp_role == GP_ROLE_DISPATCH && is_querystack_empty() && - filter_query_common(queryDesc)) + Gp_role == GP_ROLE_DISPATCH && + /* Only watch the topest query */ + is_querystack_empty() && + filter_query(queryDesc)&& + /* filter the explain analyze query */ + (queryDesc->showstatctx == NULL)) { queryDesc->instrument_options |= INSTRUMENT_CDB; queryDesc->instrument_options |= INSTRUMENT_ROWS; @@ -344,21 +351,28 @@ qs_ExecutorStart(QueryDesc *queryDesc, int eflags) INSTR_TIME_SET_CURRENT(starttime); queryDesc->showstatctx = cdbexplain_showExecStatsBegin(queryDesc, starttime); + queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); } - if (prev_ExecutorStart) - prev_ExecutorStart(queryDesc, eflags); - else - standard_ExecutorStart(queryDesc, eflags); - if (enable_qs_runtime() && ((eflags & EXEC_FLAG_EXPLAIN_ONLY)) == 0 && - queryDesc->totaltime == NULL && Gp_role == GP_ROLE_DISPATCH - && is_querystack_empty()) + if (queryDesc->plannedstmt->queryId == 0) + queryDesc->plannedstmt->queryId = + ((uint64)gp_command_count << 32) + qs_query_count; + push_query(queryDesc); + /* push query to make pg_query_stat get the stat of initplans*/ + PG_TRY(); { - MemoryContext oldcxt; - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false); - MemoryContextSwitchTo(oldcxt); + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + pop_query(); } + PG_CATCH(); + { + pop_query(); + PG_RE_THROW(); + } + PG_END_TRY(); } /* @@ -514,7 +528,6 @@ deserialize_stack(char *src, 
int stack_depth) stack_frame *frame = deserialize_stack_frame(&curr_ptr); result = lappend(result, frame); } - return result; } @@ -563,8 +576,9 @@ pg_query_state(PG_FUNCTION_ARGS) ExplainFormat format; PGPROC *proc; shm_mq_msg *msg; - List *msgs; - List *backendInfo; + List *msgs = NIL; + List *backendInfo = NIL; + PG_QS_RequestResult result_code; if (!module_initialized) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), @@ -597,24 +611,30 @@ pg_query_state(PG_FUNCTION_ARGS) LockShmem(&tag, PG_QS_RCV_KEY); check_and_init_peer(&tag, proc, 1); - backendInfo = GetRemoteBackendInfo(proc); - CdbPgResults cdb_pgresults = CollectQEQueryState(backendInfo); - AttachPeer(); - msg = GetRemoteBackendQueryStates(cdb_pgresults, - proc, - verbose, - costs, - timing, - buffers, - triggers, - format); - - msgs = NIL; - if (msg != NULL) + result_code = GetRemoteBackendInfo(proc, &backendInfo); + if (result_code != QS_RETURNED) { - msgs = lappend(msgs, msg ); + msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); + msg ->result_code = result_code; } + else + { + CdbPgResults cdb_pgresults = CollectQEQueryState(backendInfo); + AttachPeer(); + msg = GetRemoteBackendQueryStates(cdb_pgresults, + proc, + verbose, + costs, + timing, + buffers, + triggers, + format); + } + if (msg != NULL) + { + msgs = lappend(msgs, msg); + } funcctx = SRF_FIRSTCALL_INIT(); if (msgs == NULL || list_length(msgs) == 0) { @@ -951,8 +971,8 @@ DetachPeer(void) /* * Extracts all QE worker running by process `proc` */ -static List * -GetRemoteBackendInfo(PGPROC *proc) +static PG_QS_RequestResult +GetRemoteBackendInfo(PGPROC *proc, List **result) { int sig_result; shm_mq_handle *mqh; @@ -960,7 +980,6 @@ GetRemoteBackendInfo(PGPROC *proc) Size msg_len; backend_info *msg; int i; - List *result = NIL; Assert(proc && proc->backendId != InvalidBackendId); Assert(BackendInfoPollReason!= INVALID_PROCSIGNAL); @@ -974,12 +993,8 @@ GetRemoteBackendInfo(PGPROC *proc) mq_receive_result = shm_mq_receive(mqh, &msg_len, (void 
**) &msg, false); if (mq_receive_result != SHM_MQ_SUCCESS || msg == NULL || msg->reqid != reqid) goto mq_error; - if (msg->result_code == STAT_DISABLED) - ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("query execution statistics disabled"))); - if (msg->result_code == QUERY_NOT_RUNNING) - ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("backend is not running query"))); + if (msg->result_code != QS_RETURNED) + return msg->result_code; int expect_len = BASE_SIZEOF_GP_BACKEND_INFO + msg->number * sizeof(gp_segment_pid); if (msg_len != expect_len) goto mq_error; @@ -988,10 +1003,10 @@ GetRemoteBackendInfo(PGPROC *proc) { gp_segment_pid *segpid = &(msg->pids[i]); elog(DEBUG1, "QE %d is running on segment %d", segpid->pid, segpid->segid); - result = lcons(segpid, result); + *result = lcons(segpid, *result); } shm_mq_detach(mqh); - return result; + return msg->result_code;; signal_error: ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -999,7 +1014,7 @@ GetRemoteBackendInfo(PGPROC *proc) mq_error: shm_mq_detach(mqh); ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("backend is not running query"))); + errmsg("get remote backend info failed"))); } /* @@ -1076,7 +1091,7 @@ GetRemoteBackendQueryStates(CdbPgResults cdb_pgresults, params->format = format; pg_write_barrier(); create_shm_mq(MyProc, proc); - elog(DEBUG1, "CREATE shm_mq sender %d, %d, sender %d", MyProc->pid, MyProcPid, proc->pid); + elog(DEBUG1, "CREATE shm_mq sender %d, sender %d", MyProc->pid, proc->pid); mqh = shm_mq_attach(mq, NULL, NULL); sig_result = SendProcSignal(proc->pid, @@ -1416,113 +1431,123 @@ check_and_init_peer(LOCKTAG *tag, PGPROC *proc, int n_peers) static void qs_ExecutorEnd(QueryDesc *queryDesc) { - if (pg_qs_enable && is_querystack_empty() && filter_running_query(queryDesc)) + /* + * such as sql: insert into table xx command, + * it runs nothing on master node, only runs on segments + * so it will quickly run into qs_ExecutorEnd function + * and spend long 
time on 'cdbdisp_checkDispatchResult'. + * if we don't push_query here, the pg_query_state cannot + * get anything when the query is still running + */ + push_query(queryDesc); + PG_TRY(); { - qs_query *query = push_query(queryDesc); - PG_TRY(); + if (Gp_role == GP_ROLE_EXECUTE && enable_qs_runtime() && + (queryDesc->instrument_options | INSTRUMENT_ROWS) && + queryDesc->planstate->instrument) { - if (Gp_role == GP_ROLE_EXECUTE && enable_qs_runtime() && - (query->queryDesc->instrument_options | INSTRUMENT_ROWS)) - { - StringInfo strInfo = cdbexplain_getExecStats_runtime(queryDesc); - if (strInfo != NULL) - set_CachedQueryStateInfo(LocallyExecutingSliceIndex(queryDesc->estate), strInfo, - gp_command_count, query->id); - } else { - qs_print_plan(query); - } - pop_query(); + StringInfo strInfo = cdbexplain_getExecStats_runtime(queryDesc); + if (strInfo != NULL) + set_CachedQueryStateInfo(LocallyExecutingSliceIndex(queryDesc->estate), strInfo, + queryDesc->plannedstmt->queryId); } - PG_CATCH(); + else if(Gp_role == GP_ROLE_DISPATCH) { - pop_query(); - PG_RE_THROW(); + qs_print_plan(queryDesc); } - PG_END_TRY(); + pop_query(); + } + PG_CATCH(); + { + pop_query(); + PG_RE_THROW(); } + PG_END_TRY(); if (prev_ExecutorEnd) prev_ExecutorEnd(queryDesc); else standard_ExecutorEnd(queryDesc); } static void -qs_print_plan(qs_query *query) +qs_print_plan(QueryDesc *queryDesc) { MemoryContext oldcxt; - QueryDesc *queryDesc = query->queryDesc; double msec; ErrorData *qeError = NULL; - if (Gp_role == GP_ROLE_DISPATCH && queryDesc->totaltime && queryDesc->showstatctx && enable_qs_runtime()) + if (!queryDesc->totaltime || !queryDesc->showstatctx) + return; + if (!(Gp_role == GP_ROLE_DISPATCH && enable_qs_runtime())) + return; + if (!IsTransactionState()) + return; + + /* get dispatch result */ + if (!queryDesc->estate->dispatcherState || + !queryDesc->estate->dispatcherState->primaryResults) + return; + EState *estate = queryDesc->estate; + DispatchWaitMode waitMode = 
DISPATCH_WAIT_NONE; + if (!estate->es_got_eos) { - if (queryDesc->estate->dispatcherState && - queryDesc->estate->dispatcherState->primaryResults) - { - EState *estate = queryDesc->estate; - DispatchWaitMode waitMode = DISPATCH_WAIT_NONE; - if (!estate->es_got_eos) - { - ExecSquelchNode(queryDesc->planstate, true); - } + ExecSquelchNode(queryDesc->planstate, true); + } + + /* + * Wait for completion of all QEs. We send a "graceful" query + * finish, not cancel signal. Since the query has succeeded, + * don't confuse QEs by sending erroneous message. + */ + if (estate->cancelUnfinished) + waitMode = DISPATCH_WAIT_FINISH; - /* - * Wait for completion of all QEs. We send a "graceful" query - * finish, not cancel signal. Since the query has succeeded, - * don't confuse QEs by sending erroneous message. - */ - if (estate->cancelUnfinished) - waitMode = DISPATCH_WAIT_FINISH; + cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE); + cdbdisp_getDispatchResults(queryDesc->estate->dispatcherState, &qeError); + if (qeError) + return; + /* + * Make sure we operate in the per-query context, so any cruft will be + * discarded later during ExecutorEnd. + */ + oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE); - cdbdisp_getDispatchResults(queryDesc->estate->dispatcherState, &qeError); - } - if (!qeError) - { - /* - * Make sure we operate in the per-query context, so any cruft will be - * discarded later during ExecutorEnd. - */ - oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); - - /* - * Make sure stats accumulation is done. (Note: it's okay if several - * levels of hook all do this.) - */ - InstrEndLoop(queryDesc->totaltime); - /* Log plan if duration is exceeded. 
*/ - msec = queryDesc->totaltime->total; - if (msec >= 0) - { - ExplainState *es = NewExplainState(); - es->analyze = true; - es->verbose = false; - es->buffers = false; - es->wal = false; - es->timing = true; - es->summary = false; - es->format = EXPLAIN_FORMAT_JSON; - es->settings = true; - ExplainBeginOutput(es); - ExplainQueryText(es, queryDesc); - ExplainPrintPlan(es, queryDesc); - if (es->costs) - ExplainPrintJITSummary(es, queryDesc); - if (es->analyze) - ExplainPrintExecStatsEnd(es, queryDesc); - ExplainEndOutput(es); - - /* Remove last line break */ - if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n') - es->str->data[--es->str->len] = '\0'; - - es->str->data[0] = '{'; - es->str->data[es->str->len - 1] = '}'; - - /* save the qd query state, set the sliceId to be 0, it will be sent to gpsmon */ - set_CachedQueryStateInfo(0, es->str, gp_command_count, query->id); - } - MemoryContextSwitchTo(oldcxt); - } + /* + * Make sure stats accumulation is done. (Note: it's okay if several + * levels of hook all do this.) + */ + InstrEndLoop(queryDesc->totaltime); + /* Log plan if duration is exceeded. 
*/ + msec = queryDesc->totaltime->total; + if (msec >= 0) + { + ExplainState *es = NewExplainState(); + es->analyze = true; + es->verbose = false; + es->buffers = false; + es->wal = false; + es->timing = true; + es->summary = false; + es->format = EXPLAIN_FORMAT_JSON; + es->settings = true; + ExplainBeginOutput(es); + ExplainQueryText(es, queryDesc); + ExplainPrintPlan(es, queryDesc); + if (es->costs) + ExplainPrintJITSummary(es, queryDesc); + if (es->analyze) + ExplainPrintExecStatsEnd(es, queryDesc); + ExplainEndOutput(es); + + /* Remove last line break */ + if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n') + es->str->data[--es->str->len] = '\0'; + + es->str->data[0] = '{'; + es->str->data[es->str->len - 1] = '}'; + + /* save the qd query state, set the sliceId to be 0, it will be sent to gpsmon */ + set_CachedQueryStateInfo(0, es->str, queryDesc->plannedstmt->queryId); } + MemoryContextSwitchTo(oldcxt); } static void @@ -1542,7 +1567,7 @@ clear_queryStateInfo(void) } static void -set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, int gp_command_count, int queryId) +set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, uint64 queryId) { HOLD_INTERRUPTS(); if (queryStateCtx == NULL) @@ -1553,13 +1578,14 @@ set_CachedQueryStateInfo(int sliceIndex, StringInfo strInfo, int gp_command_coun } if (CachedQueryStateInfo != NULL) clear_queryStateInfo(); - MemoryContext queryContext = MemoryContextSwitchTo(queryStateCtx); - CachedQueryStateInfo = new_queryStateInfo(sliceIndex, strInfo,gp_command_count , queryId, QS_RETURNED); - MemoryContextSwitchTo(queryContext); + MemoryContext oldContext = MemoryContextSwitchTo(queryStateCtx); + /* reqid is not usefull here, just set it to 0 */ + CachedQueryStateInfo = new_queryStateInfo(sliceIndex, strInfo, 0, queryId, QS_RETURNED); + MemoryContextSwitchTo(oldContext); RESUME_INTERRUPTS(); } query_state_info* -new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, int queryId, 
PG_QS_RequestResult result_code) +new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, uint64 queryId, PG_QS_RequestResult result_code) { /* The strInfo->data[len] is \0, we need it to be included in the length */ int dataLen = strInfo->len + 1; @@ -1569,7 +1595,6 @@ new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, int queryId, P */ query_state_info *info = (query_state_info *)palloc0(dataLen + sizeof(query_state_info)); info->sliceIndex = sliceIndex; - info->gp_command_count = gp_command_count; info->queryId = queryId; info->length = strInfo->len + sizeof(query_state_info); info->reqid = reqid; @@ -1581,22 +1606,29 @@ new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, int queryId, P } static bool -filter_query_common(QueryDesc *queryDesc) +filter_query(QueryDesc *queryDesc) { + Portal portal; if (queryDesc == NULL) return false; - if (queryDesc->extended_query) - return false; + /* check if cusor query */ + if (queryDesc->extended_query && queryDesc->portal_name) + { + portal = GetPortalByName(queryDesc->portal_name); + /* cursorOptions default values is CURSOR_OPT_NO_SCROLL */ + if (portal->cursorOptions != CURSOR_OPT_NO_SCROLL) + return false; + } return (queryDesc->operation == CMD_SELECT || queryDesc->operation == CMD_DELETE || queryDesc->operation == CMD_INSERT || queryDesc->operation == CMD_UPDATE); } bool filter_running_query(QueryDesc *queryDesc) { - if (!filter_query_common(queryDesc)) + if (!filter_query(queryDesc)) return false; - if (!queryDesc->instrument_options) + if (queryDesc->planstate == NULL) return false; - if (!queryDesc->instrument_options) + if (queryDesc->estate == NULL) return false; if ((queryDesc->instrument_options & INSTRUMENT_ROWS) == 0) return false; @@ -1608,6 +1640,8 @@ enable_qs_runtime(void) { if (!pg_qs_enable) return false; + if (pg_qs_on == NULL) + return false; if (!pg_atomic_read_u32(pg_qs_on)) return false; if (strcmp(GP_VERSION, "1.6.0") <= 0) @@ -1711,14 +1745,11 @@ 
query_state_resume_command(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -static qs_query* +static void push_query(QueryDesc *queryDesc) { - qs_query *query = (qs_query *) palloc0(sizeof(qs_query)); - query->id = list_length(QueryDescStack) + 1; - query->queryDesc = queryDesc; - QueryDescStack = lcons(query, QueryDescStack); - return query; + qs_query_count++; + QueryDescStack = lcons(queryDesc, QueryDescStack); } static void @@ -1733,8 +1764,16 @@ is_querystack_empty(void) return list_length(QueryDescStack) == 0; } -qs_query* +QueryDesc* get_query(void) { - return QueryDescStack == NIL ? NULL : (qs_query *)llast(QueryDescStack); + return QueryDescStack == NIL ? NULL : (QueryDesc *)llast(QueryDescStack); +} + +int +get_command_count(query_state_info *info) +{ + if(info->queryId == 0) + return 0; + else return info->queryId>>32; } diff --git a/contrib/perfmon/src/gpmon/pg_query_state.h b/contrib/perfmon/src/gpmon/pg_query_state.h index 1e9bcfc2ee8..2ea4e4ff759 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.h +++ b/contrib/perfmon/src/gpmon/pg_query_state.h @@ -93,8 +93,7 @@ typedef struct PGPROC *proc; PG_QS_RequestResult result_code; int sliceIndex; - int gp_command_count; - int queryId; + uint64 queryId; /* data saves the CdbExplain_StatHdr */ char data[FLEXIBLE_ARRAY_MEMBER]; } query_state_info; @@ -113,13 +112,6 @@ typedef struct ExplainFormat format; } pg_qs_params; -typedef struct -{ - QueryDesc *queryDesc; - int id; -} qs_query; - - /* moved from signal_handler.c*/ /* * An self-explanarory enum describing the send_msg_by_parts results @@ -165,9 +157,10 @@ extern bool check_msg(shm_mq_result mq_receive_result, shm_mq_msg *msg, Size len extern void create_shm_mq(PGPROC *sender, PGPROC *receiver); extern bool filter_running_query(QueryDesc *queryDesc); extern query_state_info *new_queryStateInfo(int sliceIndex, StringInfo strInfo, int reqid, - int queryId, + uint64 queryId, PG_QS_RequestResult result_code); extern bool wait_for_mq_detached(shm_mq_handle *mqh); 
extern bool is_querystack_empty(void); -extern qs_query *get_query(void); +extern QueryDesc *get_query(void); +extern int get_command_count(query_state_info *info); #endif diff --git a/contrib/perfmon/src/gpmon/signal_handler.c b/contrib/perfmon/src/gpmon/signal_handler.c index 36a334878f3..bcd410169cb 100644 --- a/contrib/perfmon/src/gpmon/signal_handler.c +++ b/contrib/perfmon/src/gpmon/signal_handler.c @@ -46,9 +46,9 @@ query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg); static bool send_cdbComponents_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg); static bool -receive_QE_query_state(shm_mq_handle *mqh, List **pgresults, int queryId); -static bool -process_qe_query_state(CdbDispatcherState **disp_state, List *pgresults); +receive_QE_query_state(shm_mq_handle *mqh, List **query_state_info_list); +static CdbDispatchResults* +process_qe_query_state(QueryDesc *queryDesc, List *query_state_info_list); static void fill_segpid(CdbComponentDatabaseInfo *segInfo ,backend_info *msg, int *index); /* @@ -180,11 +180,14 @@ SendQueryState(void) int reqid = params->reqid; MemoryContext oldctx; bool success = true; + volatile int32 savedInterruptHoldoffCount; MemoryContext query_state_ctx = AllocSetContextCreate(TopMemoryContext, "pg_query_state", ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(query_state_ctx); + /* in elog(ERROR), InterruptHoldoffCount will be set to 0 */ + savedInterruptHoldoffCount = InterruptHoldoffCount; elog(DEBUG1, "Worker %d receives pg_query_state request from %d", shm_mq_get_sender(mq)->pid, shm_mq_get_receiver(mq)->pid); PG_TRY(); @@ -214,10 +217,10 @@ SendQueryState(void) } PG_CATCH(); { - MemoryContextSwitchTo(oldctx); elog(WARNING, "Failed to send query state"); elog_dismiss(WARNING); success = false; + InterruptHoldoffCount = savedInterruptHoldoffCount; } PG_END_TRY(); shm_mq_detach(mqh); @@ -243,10 +246,13 @@ SendCdbComponents(void) MemoryContext oldctx; bool success = true; int index = 0; + volatile int32 
savedInterruptHoldoffCount; MemoryContext query_state_ctx = AllocSetContextCreate(TopMemoryContext, "pg_query_state", ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(query_state_ctx); + /* in elog(ERROR), InterruptHoldoffCount will be set to 0 */ + savedInterruptHoldoffCount = InterruptHoldoffCount; pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); PG_TRY(); { @@ -297,11 +303,11 @@ SendCdbComponents(void) } PG_CATCH(); { - MemoryContextSwitchTo(oldctx); elog(WARNING, " SendCdbComponents failed"); elog_dismiss(WARNING); success = false; shm_mq_detach(mqh); + InterruptHoldoffCount = savedInterruptHoldoffCount; } PG_END_TRY(); DetachPeer(); @@ -323,19 +329,18 @@ QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) instr_time starttime; List *qs_stack = NIL; LOCKTAG tag; + volatile int32 savedInterruptHoldoffCount; bool success = true; PGPROC *sender; - List *pgresults = NIL; + List *query_state_info_list = NIL; + disp_state = palloc0(sizeof(CdbDispatcherState)); shm_mq_msg *pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); - qs_query *query = get_query(); - int queryId = query == NULL? -1 : query->id; - + queryDesc = get_query(); /* first receive the results, it may be empty, such as functions only run on master */ - if (!receive_QE_query_state(mqh, &pgresults, queryId)) - return false; - queryDesc = query == NULL? 
NULL: query->queryDesc; - if (!process_qe_query_state(&disp_state, pgresults)) + if (!receive_QE_query_state(mqh, &query_state_info_list)) return false; + disp_state->primaryResults = process_qe_query_state(queryDesc, query_state_info_list); + sender = shm_mq_get_sender(mq); if (!wait_for_mq_detached(mqh)) return false; @@ -365,6 +370,7 @@ QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) */ CdbDispatcherState *old_disp_state = queryDesc->estate->dispatcherState; struct CdbExplain_ShowStatCtx *oldShowstatctx = queryDesc->showstatctx; + savedInterruptHoldoffCount = InterruptHoldoffCount; PG_TRY(); { /* initialize explain state with all config parameters */ @@ -425,6 +431,7 @@ QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) queryDesc->showstatctx = oldShowstatctx; elog_dismiss(WARNING); success = false; + InterruptHoldoffCount = savedInterruptHoldoffCount; } PG_END_TRY(); if (!success) @@ -471,11 +478,11 @@ QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) static bool QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) { - //QueryDesc *queryDesc; - qs_query *query; + QueryDesc *queryDesc; int sliceIndex; - query_state_info *info; + query_state_info *info = NULL; shm_mq_msg *pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); + volatile int32 savedInterruptHoldoffCount; bool success = true; /* cannot use the send_msg_by_parts here */ if (!query_state_pre_check(mqh, params->reqid, pre_check_msg)) @@ -489,6 +496,7 @@ QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) } return true; } + savedInterruptHoldoffCount = InterruptHoldoffCount; PG_TRY(); { @@ -498,36 +506,39 @@ QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) success = false; else { - int dataLen = 0; info = (query_state_info *)palloc0(CachedQueryStateInfo->length); info->length = CachedQueryStateInfo->length; dataLen = CachedQueryStateInfo->length - sizeof(query_state_info); info->sliceIndex = CachedQueryStateInfo->sliceIndex; - info->gp_command_count = 
CachedQueryStateInfo->gp_command_count; memcpy(info->data, CachedQueryStateInfo->data, dataLen); info->reqid = params->reqid; info->proc = MyProc; info->result_code = QS_RETURNED; + info->queryId = CachedQueryStateInfo->queryId; } } - else { - query = get_query(); - Assert(query && query->queryDesc); - StringInfo strInfo = cdbexplain_getExecStats_runtime(query->queryDesc); + else + { + queryDesc = get_query(); + Assert(queryDesc); + StringInfo strInfo = cdbexplain_getExecStats_runtime(queryDesc); if (strInfo == NULL) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("cannot get runtime stats"))); - sliceIndex = LocallyExecutingSliceIndex(query->queryDesc->estate); - info = new_queryStateInfo(sliceIndex, strInfo, params->reqid, query->id, QS_RETURNED); + sliceIndex = LocallyExecutingSliceIndex(queryDesc->estate); + info = new_queryStateInfo(sliceIndex, strInfo, params->reqid, queryDesc->plannedstmt->queryId, QS_RETURNED); } - msg_by_parts_result sendResult = send_msg_by_parts(mqh, info->length, info); - pfree(info); - if (sendResult != MSG_BY_PARTS_SUCCEEDED) + if (info != NULL) { - elog(DEBUG1, "pg_query_state: peer seems to have detached"); - success = false; + msg_by_parts_result sendResult = send_msg_by_parts(mqh, info->length, info); + pfree(info); + if (sendResult != MSG_BY_PARTS_SUCCEEDED) + { + elog(DEBUG1, "pg_query_state: peer seems to have detached"); + success = false; + } } } PG_CATCH(); @@ -535,6 +546,7 @@ QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) elog_dismiss(WARNING); elog(WARNING, "failed to get QE query state"); success = false; + InterruptHoldoffCount = savedInterruptHoldoffCount; } PG_END_TRY(); if (success) @@ -601,7 +613,7 @@ set_msg(shm_mq_msg *msg, int reqid, PG_QS_RequestResult res) static bool query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) { - qs_query *query = NULL; + QueryDesc *queryDesc; /* check if module is enabled */ if (!enable_qs_runtime()) { @@ -615,16 +627,15 @@ 
query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) } /* no query running on QD/QE */ - if (list_length(QueryDescStack) != 1) + if (list_length(QueryDescStack) <= 0) { set_msg(msg, reqid, QUERY_NOT_RUNNING); return false; } - query = get_query(); - Assert(query && query->queryDesc); - + queryDesc = get_query(); + Assert(queryDesc); - if (!filter_running_query(query->queryDesc)) + if (!filter_running_query(queryDesc)) { set_msg(msg, reqid, QUERY_NOT_RUNNING); return false; @@ -632,14 +643,6 @@ query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) return true; } -struct slice_result -{ - int sliceIndex; - int gp_command_count; - int queryId; - PGresult *pgresult; -}; - /* Receive and process query stats from QE * * Firstly get the num of results as numresults @@ -650,7 +653,7 @@ struct slice_result * CdbExplain_StatHdr is saved in query_state_info.data */ static bool -receive_QE_query_state(shm_mq_handle *mqh, List **pgresults, int queryId) +receive_QE_query_state(shm_mq_handle *mqh, List **query_state_info_list) { shm_mq_result mq_receive_result; Size len; @@ -672,10 +675,6 @@ receive_QE_query_state(shm_mq_handle *mqh, List **pgresults, int queryId) } for (int i = 0; i < *numresults; i++) { - PGresult *pgresult = palloc(sizeof(PGresult)); - int seg_command_count; - pgCdbStatCell *statcell = (pgCdbStatCell*)palloc(sizeof(pgCdbStatCell)); - mq_receive_result = shm_mq_receive_with_timeout(mqh, &len, (void **)&seg_query_state_info, @@ -686,84 +685,63 @@ receive_QE_query_state(shm_mq_handle *mqh, List **pgresults, int queryId) /* counterpart is dead, not considering it */ return false; } - /* - * Check if the query on segment is the same with the current query - * There is the case when the query stat are collected from the segment, - * QD has started to run the next query. 
- */ - seg_command_count = seg_query_state_info->gp_command_count; - if (seg_command_count != gp_command_count || seg_query_state_info->queryId != queryId) - { - elog(DEBUG1, "receive QE query state results command id or queryId is not correct"); - continue; - } - /* transform CdbExplain_StatHdr to pgresult */ - statcell->data = seg_query_state_info->data; - statcell->len = len - sizeof(query_state_info); - statcell->next = NULL; - pgresult->cdbstats = statcell; - struct slice_result *res = palloc(sizeof(struct slice_result)); - res->sliceIndex = seg_query_state_info->sliceIndex; - res->pgresult = pgresult; - res->gp_command_count = seg_command_count; - res->queryId = seg_query_state_info->queryId; - *pgresults = lappend(*pgresults, res); - elog(DEBUG1, "receive QE query state %d successfully", res->sliceIndex); + *query_state_info_list = lappend(*query_state_info_list, seg_query_state_info); + elog(DEBUG1, "receive QE query state slice %d, proc %d successfully", seg_query_state_info->sliceIndex, seg_query_state_info->proc->backendId); } return true; } -static bool -process_qe_query_state(CdbDispatcherState **disp_state, List *pgresults) +static CdbDispatchResults* +process_qe_query_state(QueryDesc *queryDesc, List *query_state_info_list) { - QueryDesc *queryDesc; EState *estate; - CdbDispatchResults *results; - *disp_state = NULL; - qs_query *query; - /* give spicify error code for it*/ - if (list_length(QueryDescStack) != 1) - { - return false; - } - query = get_query(); - Assert(query && query->queryDesc); - queryDesc = query->queryDesc; + CdbDispatchResults *results = NULL; + uint64 queryId; /* The query maybe has been finished */ if (queryDesc == NULL || queryDesc->estate == NULL) { - return true; + return results; } estate = queryDesc->estate; + queryId = queryDesc->plannedstmt->queryId; /* first constuct a CdbDispatchResults */ results = makeDispatchResults(estate->es_sliceTable); - if (results->resultCapacity < list_length(pgresults)) + if 
(results->resultCapacity < list_length(query_state_info_list)) { /* explain analyze select test_auto_stats_in_function('delete from t_test_auto_stats_in_function', true, 't_test_auto_stats_in_function')*/ - return true; + return results; } /* the pgresult of the same slice should be put in continous memory */ for(int i = 0 ; i < estate->es_sliceTable->numSlices; i++) { ListCell *c; - foreach(c, pgresults) - { - struct slice_result *res = (struct slice_result *)lfirst(c); - if(res->sliceIndex == i) - { - CdbDispatchResult *dispatchResult = cdbdisp_makeResult(results, NULL, res->sliceIndex); - cdbdisp_appendResult(dispatchResult, res->pgresult); + foreach(c, query_state_info_list) + { + query_state_info *info = (query_state_info *)lfirst(c); + /* if the query state's queryId not equal to current queryId, skip it */ + if (info->queryId != queryId) + { + continue; + } + pgCdbStatCell *statcell = (pgCdbStatCell *)palloc(sizeof(pgCdbStatCell)); + PGresult *pgresult = palloc(sizeof(PGresult)); + statcell->data = info->data; + statcell->len = info->length - sizeof(query_state_info); + statcell->next = NULL; + pgresult->cdbstats = statcell; + if (info->sliceIndex == i) + { + CdbDispatchResult *dispatchResult = cdbdisp_makeResult(results, NULL, info->sliceIndex); + cdbdisp_appendResult(dispatchResult, pgresult); } - } + } } - *disp_state = MemoryContextAllocZero(CurrentMemoryContext, sizeof(CdbDispatcherState)); - (*disp_state)->primaryResults = results; - return true; + return results; } -static void +static void fill_segpid(CdbComponentDatabaseInfo *segInfo , backend_info *msg, int* index) { diff --git a/contrib/perfmon/src/gpmon/tests/common.py b/contrib/perfmon/src/gpmon/tests/common.py new file mode 100644 index 00000000000..f168b2736a9 --- /dev/null +++ b/contrib/perfmon/src/gpmon/tests/common.py @@ -0,0 +1,169 @@ +''' +common.py +Copyright (c) 2016-2024, Postgres Professional +''' + +import psycopg2 +import psycopg2.extensions +import select +import time + 
+BACKEND_IS_IDLE_INFO = 'INFO: state of backend is idle\n' +BACKEND_IS_ACTIVE_INFO = 'INFO: state of backend is active\n' + +def wait(conn): + """wait for some event on connection to postgres""" + while 1: + state = conn.poll() + if state == psycopg2.extensions.POLL_OK: + break + elif state == psycopg2.extensions.POLL_WRITE: + select.select([], [conn.fileno()], []) + elif state == psycopg2.extensions.POLL_READ: + select.select([conn.fileno()], [], []) + else: + raise psycopg2.OperationalError("poll() returned %s" % state) + +def n_async_connect(config, n=1): + """establish n asynchronious connections to the postgres with specified config""" + + aconfig = config.copy() + aconfig['async'] = True + + result = [] + for _ in range(n): + conn = psycopg2.connect(**aconfig) + wait(conn) + result.append(conn) + return result + +def n_close(conns): + """close connections to postgres""" + + for conn in conns: + conn.close() + +def pg_query_state_locks(config, pid, conn, verbose=False, costs=False, timing=False, \ + buffers=False, triggers=False, format='text'): + """ + Get query state from backend with specified pid and optional parameters. + Save any warning, info, notice and log data in global variable 'notices' + """ + + curs = conn.cursor() + curs.callproc('pg_query_state', (pid, verbose, costs, timing, buffers, triggers, format)) + wait(conn) + result = curs.fetchall() + notices = conn.notices[:] + + return result, notices + +def pg_query_state(config, pid, verbose=False, costs=False, timing=False, \ + buffers=False, triggers=False, format='text'): + """ + Get query state from backend with specified pid and optional parameters. 
+ Save any warning, info, notice and log data in global variable 'notices' + """ + + conn = psycopg2.connect(**config) + curs = conn.cursor() + curs.callproc('pg_query_state', (pid, verbose, costs, timing, buffers, triggers, format)) + result = curs.fetchall() + notices = conn.notices[:] + conn.close() + + return result, notices + +def onetime_query_state_locks(config, acon_query, acon_pg, query, args={}, num_workers=0): + """ + Get intermediate state of 'query' on connection 'acon_query' after number of 'steps' + of node executions from start of query + """ + + curs_query = acon_query.cursor() + curs_pg = acon_pg.cursor() + set_guc(acon_query, 'optimizer', 'off') + curs_query.execute("select pg_advisory_lock(1) FROM gp_dist_random('gp_id') where gp_segment_id = 0") + curs_pg.execute("select pg_advisory_lock(2) FROM gp_dist_random('gp_id') where gp_segment_id = 0") + wait(acon_query) + wait(acon_pg) + curs_pg.execute("select pg_advisory_lock(1) FROM gp_dist_random('gp_id') where gp_segment_id = 0") + # set_guc(acon_query, 'enable_mergejoin', 'off') + # set_guc(acon_query, 'max_parallel_workers_per_gather', num_workers) + curs_query.execute(query) + # extract current state of query progress + MAX_PG_QS_RETRIES = 10 + DELAY_BETWEEN_RETRIES = 0.3 + pg_qs_args = { + 'config': config, + 'pid': acon_query.get_backend_pid(), + 'conn': acon_pg + } + for k, v in args.items(): + pg_qs_args[k] = v + n_retries = 0 + + wait(acon_pg) + + while True: + result, notices = pg_query_state_locks(**pg_qs_args) + n_retries += 1 + if len(result) > 0 : + break + if n_retries >= MAX_PG_QS_RETRIES: + # pg_query_state callings don't return any result, more likely run + # query has completed + break + time.sleep(DELAY_BETWEEN_RETRIES) + + curs_pg.execute("select pg_advisory_unlock(2) FROM gp_dist_random('gp_id') where gp_segment_id = 0;") + wait(acon_pg) + wait(acon_query) + + set_guc(acon_query, 'enable_mergejoin', 'on') + curs_query.execute("select pg_advisory_unlock(2) FROM 
gp_dist_random('gp_id') where gp_segment_id = 0;") + curs_pg.execute("select pg_advisory_unlock(1) FROM gp_dist_random('gp_id') where gp_segment_id = 0;") + return result, notices + +def onetime_query_state(config, async_conn, query, args={}, num_workers=0): + """ + Get intermediate state of 'query' on connection 'async_conn' after number of 'steps' + of node executions from start of query + """ + + acurs = async_conn.cursor() + + set_guc(async_conn, 'enable_mergejoin', 'off') + set_guc(async_conn, 'max_parallel_workers_per_gather', num_workers) + set_guc(async_conn, 'optimizer', 'off') + acurs.execute(query) + + # extract current state of query progress + MAX_PG_QS_RETRIES = 10 + DELAY_BETWEEN_RETRIES = 0.1 + pg_qs_args = { + 'config': config, + 'pid': async_conn.get_backend_pid() + } + for k, v in args.items(): + pg_qs_args[k] = v + n_retries = 0 + while True: + result, notices = pg_query_state(**pg_qs_args) + n_retries += 1 + if len(result) > 0: + break + if n_retries >= MAX_PG_QS_RETRIES: + # pg_query_state callings don't return any result, more likely run + # query has completed + break + time.sleep(DELAY_BETWEEN_RETRIES) + wait(async_conn) + + set_guc(async_conn, 'enable_mergejoin', 'on') + return result, notices + +def set_guc(async_conn, param, value): + acurs = async_conn.cursor() + acurs.execute('set %s to %s' % (param, value)) + wait(async_conn) diff --git a/contrib/perfmon/src/gpmon/tests/pg_qs_test_runner.py b/contrib/perfmon/src/gpmon/tests/pg_qs_test_runner.py new file mode 100644 index 00000000000..d4195bd8fa8 --- /dev/null +++ b/contrib/perfmon/src/gpmon/tests/pg_qs_test_runner.py @@ -0,0 +1,152 @@ +''' +pg_qs_test_runner.py +Copyright (c) 2016-2024, Postgres Professional +''' + +import argparse +import getpass +import os +import sys + +import psycopg2 + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) +from test_cases import * + +class PasswordPromptAction(argparse.Action): + def __call__(self, parser, args, values, 
option_string=None): + password = getpass.getpass() + setattr(args, self.dest, password) + +class SetupException(Exception): pass +class TeardownException(Exception): pass + +unlock_if_eq_1 = """ + CREATE OR REPLACE FUNCTION unlock_if_eq_1(x integer) RETURNS integer AS $$ + DECLARE + BEGIN + IF x = 1 THEN + perform pg_advisory_unlock(1); + perform pg_sleep(1); + perform pg_advisory_lock(2); + END IF; + return x; + END; + $$ LANGUAGE plpgsql + """ + +setup_cmd = [ + 'drop extension if exists perfmon cascade', + 'drop table if exists foo cascade', + 'drop table if exists bar cascade', + 'drop table if exists tt cascade', + 'create extension perfmon', + 'create table foo(c1 integer, c2 text) distributed randomly;', + 'create table bar(c1 integer, c2 boolean) distributed replicated;', + 'create table tt(c1 integer, c2 text)', + 'insert into foo select 1 from gp_dist_random(\'gp_id\')', + 'insert into foo select i, md5(random()::text) from generate_series(1, 1000000) as i', + 'delete from foo where gp_segment_id > 0 and c1 = 1', + 'insert into bar select i, i%2=1 from generate_series(1, 500000) as i', + 'insert into tt select i, md5(random()::text) from generate_series(1, 1000000) as i ', + 'analyze foo', + 'analyze bar', + 'analyze tt', + unlock_if_eq_1, +] + +teardown_cmd = [ + 'drop table foo cascade', + 'drop table bar cascade', + 'drop extension perfmon cascade', +] + +tests = [ + test_deadlock, + test_simple_query, + test_concurrent_access, + test_nested_call, +# test_trigger, + test_costs, +# test_buffers, + test_timing, +# test_formats, +# test_timing_buffers_conflicts, + test_insert_on_conflict, + test_skip_explain_analyze, + test_init_plan, + test_qe_cache_query, +] + +def setup(con): + ''' Creates pg_query_state extension, creates tables for tests, fills it with data ''' + print('setting up...') + try: + cur = con.cursor() + for cmd in setup_cmd: + cur.execute(cmd) + con.commit() + cur.close() + except Exception as e: + raise SetupException('Setup failed: %s' 
% e) + print('done!') + +def teardown(con): + ''' Drops table and extension ''' + print('tearing down...') + try: + cur = con.cursor() + for cmd in teardown_cmd: + cur.execute(cmd) + con.commit() + cur.close() + except Exception as e: + raise TeardownException('Teardown failed: %s' % e) + print('done!') + +def main(config): + ''' Main test function ''' + conn_params = { + key:config.__dict__[key] for key in ('host', 'port', 'user', 'database', 'password') + } + + if config.tpcds_setup: + print('Setup database for TPC-DS bench') + tpcds.setup_tpcds(conn_params) + print('Database is setup successfully') + return + + if config.tpcds_run: + print('Starting stress test') + tpcds.run_tpcds(conn_params) + print('Stress finished successfully') + return + + # run default tests + init_conn = psycopg2.connect(**conn_params) + setup(init_conn) + for i, test in enumerate(tests): + if test.__doc__: + descr = test.__doc__ + else: + descr = 'test case %d' % (i+1) + print(("%s..." % descr)) + sys.stdout.flush() + test(conn_params) + print('ok!') + teardown(init_conn) + init_conn.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Query state of running backends tests') + + parser.add_argument('--host', default='localhost', help='postgres server host') + parser.add_argument('--port', type=int, default=5432, help='postgres server port') + parser.add_argument('--user', dest='user', default='postgres', help='user name') + parser.add_argument('--database', dest='database', default='postgres', help='database name') + parser.add_argument('--password', dest='password', nargs=0, action=PasswordPromptAction, default='', help='password') + parser.add_argument('--tpc-ds-setup', dest='tpcds_setup', action='store_true', help='setup database to run TPC-DS benchmark') + parser.add_argument('--tpc-ds-run', dest='tpcds_run', action='store_true', help='run only stress test based on TPC-DS benchmark') + + args = parser.parse_args() + main(args) diff --git 
a/contrib/perfmon/src/gpmon/tests/test_cases.py b/contrib/perfmon/src/gpmon/tests/test_cases.py new file mode 100644 index 00000000000..6b89b3cce5b --- /dev/null +++ b/contrib/perfmon/src/gpmon/tests/test_cases.py @@ -0,0 +1,473 @@ +''' +test_cases.py +Copyright (c) 2016-2024, Postgres Professional +''' + +import json +import re +import select +import time +import xml.etree.ElementTree as ET + +import psycopg2 +import yaml + +import common + +def test_deadlock(config): + """test when two backends try to extract state of each other""" + + acon1, acon2 = common.n_async_connect(config, 2) + acurs1 = acon1.cursor() + acurs2 = acon2.cursor() + + while True: + acurs1.callproc('pg_query_state', (acon2.get_backend_pid(),)) + acurs2.callproc('pg_query_state', (acon1.get_backend_pid(),)) + + # listen acon1, acon2 with timeout = 3 sec to determine deadlock + r, w, x = select.select([acon1.fileno(), acon2.fileno()], [], [], 3) + assert (r or w or x), "Deadlock is happened under cross reading of query states" + + common.wait(acon1) + common.wait(acon2) + + # exit from loop if one backend could read state of execution 'pg_query_state' + # from other backend + if acurs1.fetchone() or acurs2.fetchone(): + break + + common.n_close((acon1, acon2)) + +def test_simple_query(config): + """test statistics of simple query""" + acon1, acon2 = common.n_async_connect(config, 2) + query = 'select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1' + + expected = r"""Query Text: select count\(\*\) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1\(foo.c1\)=bar.c1 +Finalize Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Partial Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Hash Join \(node status: Executing\) \(actual rows=\d+, loops=1\) + Hash Cond: \(bar.c1 = foo.c1\) + Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar.c1\) + -> Seq 
Scan on bar \(node status: Executing\) \(actual rows=\d+, loops=1\) + -> Hash \(node status: Executing\) \(actual rows=\d+, loops=1\) + -> Seq Scan on foo \(node status: .*\) \(actual rows=\d+, loops=1\)""" + + qs, _ = common.onetime_query_state_locks(config, acon1, acon2, query) + assert qs[0][0] == acon1.get_backend_pid() + assert qs[0][1] == 0 + assert qs[0][2] == query + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert qs[0][4] == None + # assert qs[0][0] == acon.get_backend_pid() and qs[0][1] == 0 \ + # and qs[0][2] == query and re.match(expected, qs[0][3]) and qs[0][4] == None + + common.n_close((acon1, acon2)) + +def test_concurrent_access(config): + """test when two backends compete with each other to extract state from third running backend""" + + acon1, acon2, acon3 = common.n_async_connect(config, 3) + acurs1, acurs2, acurs3 = acon1.cursor(), acon2.cursor(), acon3.cursor() + query = 'select pg_sleep(2)' + + common.set_guc(acon3, 'max_parallel_workers_per_gather', 0) + acurs3.execute(query) + time.sleep(0.1) + acurs1.callproc('pg_query_state', (acon3.get_backend_pid(),)) + acurs2.callproc('pg_query_state', (acon3.get_backend_pid(),)) + common.wait(acon1) + common.wait(acon2) + common.wait(acon3) + + qs1, qs2 = acurs1.fetchall(), acurs2.fetchall() + assert len(qs1) == len(qs2) == 1 \ + and qs1[0][0] == qs2[0][0] == acon3.get_backend_pid() \ + and qs1[0][1] == qs2[0][1] == 0 \ + and qs1[0][2] == qs2[0][2] == query \ + and len(qs1[0][3]) > 0 and len(qs2[0][3]) > 0 + + common.n_close((acon1, acon2, acon3)) + +# in hashdata-lightning, only print the top level query state +def test_nested_call(config): + """test statistics under calling function""" + + acon1, acon2 = common.n_async_connect(config, 2) + util_conn = psycopg2.connect(**config) + util_curs = util_conn.cursor() + create_function = """ + create or replace function n_join_foo_bar() returns integer as $$ + begin + return (select count(*) from foo 
join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1); + end; + $$ language plpgsql""" + drop_function = 'drop function n_join_foo_bar()' + call_function = 'select * from n_join_foo_bar()' + nested_query1 = '(select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1)' + expected = r"""Query Text: select \* from n_join_foo_bar\(\) +Result \(node status: .*\) \(actual rows=\d+, loops=1\) + InitPlan 1 \(returns \$0\) + -> Result \(node status: .*\) \(actual rows=\d+, loops=1\)""" + util_curs.execute(create_function) + util_conn.commit() + + qs, notices = common.onetime_query_state_locks(config, acon1, acon2, call_function) + + assert len(qs) == 1 + assert qs[0][0] == acon1.get_backend_pid() + assert qs[0][1] == 0 + assert qs[0][2] == call_function + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert qs[0][4] == None + assert len(notices) == 0 + + util_curs.execute(drop_function) + + util_conn.close() + common.n_close((acon1, acon2)) + +def test_insert_on_conflict(config): + """test statistics on conflicting tuples under INSERT ON CONFLICT query""" + + acon, = common.n_async_connect(config) + util_conn = psycopg2.connect(**config) + util_curs = util_conn.cursor() + add_field_uniqueness = 'alter table tt add constraint unique_c1 unique(c1)' + drop_field_uniqueness = 'alter table tt drop constraint unique_c1' + query = 'insert into tt select i, md5(random()::text) from generate_series(1, 30000) as i on conflict do nothing' + + expected = r"""Query Text: insert into tt select i, md5\(random\(\)::text\) from generate_series\(1, 30000\) as i on conflict do nothing +Insert on tt \(node status: .*\) \(actual rows=\d+, loops=1\) + Conflict Resolution: NOTHING + Conflicting Tuples: \d+ + -> Redistribute Motion 1:\d+ \(slice1; segments: 1\) \(node status: .*\) \(actual rows=\d+, loops=1\) + Hash Key: i.i + -> Function Scan on generate_series i \(node status: .*\) \(actual rows=\d+, loops=1\)""" 
+ + util_curs.execute(add_field_uniqueness) + util_conn.commit() + + qs, notices = common.onetime_query_state(config, acon, query) + assert qs[0][0] == acon.get_backend_pid() + assert qs[0][1] == 0 + assert qs[0][2] == query + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert qs[0][4] == None + assert len(notices) == 0 + + util_curs.execute(drop_field_uniqueness) + util_conn.commit() + + util_conn.close() + common.n_close((acon,)) + +def test_trigger(config): + """test trigger statistics""" + + acon, = common.n_async_connect(config) + acurs = acon.cursor() + util_conn = psycopg2.connect(**config) + util_curs = util_conn.cursor() + create_trigger_function = """ + create or replace function unique_c1_in_foo() returns trigger as $$ + begin + if new.c1 in (select c1 from foo) then + return null; + end if; + return new; + end; + $$ language plpgsql""" + create_trigger = """ + create trigger unique_foo_c1 + before insert or update of c1 on foo for row + execute procedure unique_c1_in_foo()""" + drop_temps = 'drop function unique_c1_in_foo() cascade' + query = 'insert into foo select i, md5(random()::text) from generate_series(1, 10000) as i' + expected_upper = r"""Insert on foo \(Current loop: actual rows=\d+, loop number=1\) + -> Function Scan on generate_series i \(Current loop: actual rows=\d+, loop number=1\)""" + trigger_suffix = r"""Trigger unique_foo_c1: calls=\d+""" + + util_curs.execute(create_trigger_function) + util_curs.execute(create_trigger) + util_conn.commit() + + qs, notices = common.onetime_query_state(config, acon, query, {'triggers': True}) + assert qs[0][0] == acon.get_backend_pid() and qs[0][1] == 0 \ + and qs[0][2] == query and re.match(expected_upper, qs[0][3]) \ + and qs[0][4] == None + assert len(notices) == 0 + + qs, notices = common.onetime_query_state(config, acon, query, {'triggers': False}) + assert qs[0][0] == acon.get_backend_pid() and qs[0][1] == 0 \ + and qs[0][2] == query and 
re.match(expected_upper, qs[0][3]) \ + and qs[0][4] == None + assert len(notices) == 0 + + util_curs.execute(drop_temps) + + util_conn.close() + common.n_close((acon,)) + +def test_costs(config): + """test plan costs""" + + acon1, acon2 = common.n_async_connect(config, 2) + + + query = 'select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1' + expected = r"""Query Text: select count\(\*\) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1\(foo.c1\)=bar.c1 +Finalize Aggregate \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Partial Aggregate \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Hash Join \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + Hash Cond: \(bar.c1 = foo.c1\) + Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar.c1\) + -> Seq Scan on bar \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Hash \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Seq Scan on foo \(cost=\d+.\d+..\d+.\d+ rows=\d+ width=\d+\) \(node status: .*\) \(actual rows=\d+, loops=1\)""" + + qs, notices = common.onetime_query_state_locks(config, acon1, acon2, query, {'costs': True}) + + assert len(qs) == 1 + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert len(notices) == 0 + + common.n_close((acon1, acon2)) + +def test_buffers(config): + """test buffer statistics""" + + acon1, acon2 = common.n_async_connect(config, 2) + query = 'select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1' + temporary = r"""Aggregate \(Current loop: actual rows=0, loop number=1\) + -> Hash Join \(Current loop: 
actual rows=\d+, loop number=1\) + Hash Cond: \(foo.c1 = bar.c1\) + Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar.c1\)""" + expected = temporary + expected_15 = temporary + expected += r""" + Buffers: shared hit=\d+, temp read=\d+ written=\d+""" + expected_15 += r""" + Buffers: shared hit=\d+, temp written=\d+""" + temporary = r""" + -> Seq Scan on foo \(Current loop: actual rows=\d+, loop number=1\) + Buffers: [^\n]* + -> Hash \(Current loop: actual rows=500000, loop number=1\) + Buckets: \d+ Batches: \d+ Memory Usage: \d+kB + Buffers: shared hit=\d+, temp written=\d+ + -> Seq Scan on bar \(Current loop: actual rows=\d+, loop number=1\) + Buffers: .*""" + expected += temporary + expected_15 += temporary + + common.set_guc(acon1, 'pg_query_state.enable_buffers', 'on') + + qs, notices = common.onetime_query_state_locks(config, acon1, acon2, query, {'buffers': True}) + + assert len(qs) == 2 + assert (re.match(expected, qs[0][3]) or re.match(expected_15, qs[0][3])) + assert len(notices) == 0 + + common.n_close((acon1, acon2)) + +def test_timing(config): + """test timing statistics""" + + acon1, acon2 = common.n_async_connect(config, 2) + query = 'select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1' + expected = r"""Query Text: select count\(\*\) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1\(foo.c1\)=bar.c1 +Finalize Aggregate \(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\) + -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\) + -> Partial Aggregate \(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\) + -> Hash Join \(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\) + Hash Cond: \(bar.c1 = foo.c1\) + Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar.c1\) + -> Seq Scan on bar \(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\) + -> Hash \(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\) + -> Seq Scan on foo 
\(node status: .*\) \(actual time=\d+.\d+.* rows=\d+, loops=1\)""" + common.set_guc(acon1, 'pg_query_state.enable_timing', 'on') + + qs, notices = common.onetime_query_state_locks(config, acon1, acon2, query, {'timing': True}) + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert len(notices) == 0 + + common.n_close((acon1, acon2)) + +def check_plan(plan): + assert 'Current loop' in plan + cur_loop = plan['Current loop'] + assert 'Actual Loop Number' in cur_loop\ + and 'Actual Rows' in cur_loop + + if not 'Plans' in plan: + return + + for subplan in plan['Plans']: + check_plan(subplan) + +def check_xml(root): + prefix = '{http://www.postgresql.org/2009/explain}' + for plan in root.iter(prefix + 'Plan'): + cur_loop = plan.find(prefix + 'Current-loop') + assert cur_loop != None \ + and cur_loop.find(prefix + 'Actual-Loop-Number') != None \ + and cur_loop.find(prefix + 'Actual-Rows') != None + +def test_formats(config): + """test all formats of pg_query_state output""" + + acon, = common.n_async_connect(config) + query = 'select count(*) from foo join bar on foo.c1=bar.c1' + expected = r"""Query Text: select count\(\*\) from foo join bar on foo.c1=bar.c1 +Finalize Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Partial Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Hash Join \(node status: .*\) \(actual rows=\d+, loops=1\) + Hash Cond: \(foo.c1 = bar.c1\) + -> Seq Scan on foo \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Hash \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Seq Scan on bar \(node status: .*\) \(actual rows=\d+, loops=1\)""" + + qs, notices = common.onetime_query_state(config, acon, query, {'format': 'text'}) + assert len(qs) == 1 + assert re.match(expected, qs[0][3]) + assert len(notices) == 0 + + qs, notices = common.onetime_query_state(config, acon, query, 
{'format': 'json'}) + try: + js_obj = json.loads(qs[0][3]) + except ValueError: + assert False, 'Invalid json format' + assert len(qs) == 1 + assert len(notices) == 0 + check_plan(js_obj['Plan']) + + qs, notices = common.onetime_query_state(config, acon, query, {'format': 'xml'}) + assert len(qs) == 1 + assert len(notices) == 0 + try: + xml_root = ET.fromstring(qs[0][3]) + except: + assert False, 'Invalid xml format' + check_xml(xml_root) + + qs, _ = common.onetime_query_state(config, acon, query, {'format': 'yaml'}) + try: + yaml_doc = yaml.load(qs[0][3], Loader=yaml.FullLoader) + except: + assert False, 'Invalid yaml format' + assert len(qs) == 1 + assert len(notices) == 0 + check_plan(yaml_doc['Plan']) + + common.n_close((acon,)) + +def test_timing_buffers_conflicts(config): + """test when caller requests timing and buffers but counterpart turned off its""" + + acon, = common.n_async_connect(config) + query = 'select count(*) from foo join bar on foo.c1=bar.c1' + timing_pattern = '(?:running time=\d+.\d+)|(?:actual time=\d+.\d+..\d+.\d+)' + buffers_pattern = 'Buffers:' + + common.set_guc(acon, 'pg_query_state.enable_timing', 'off') + common.set_guc(acon, 'pg_query_state.enable_buffers', 'off') + + qs, notices = common.onetime_query_state(config, acon, query, {'timing': True, 'buffers': False}) + assert len(qs) == 1 and not re.search(timing_pattern, qs[0][3]) + assert notices == ['WARNING: timing statistics disabled\n'] + + qs, notices = common.onetime_query_state(config, acon, query, {'timing': False, 'buffers': True}) + assert len(qs) == 1 and not re.search(buffers_pattern, qs[0][3]) + assert notices == ['WARNING: buffers statistics disabled\n'] + + qs, notices = common.onetime_query_state(config, acon, query, {'timing': True, 'buffers': True}) + assert len(qs) == 1 and not re.search(timing_pattern, qs[0][3]) \ + and not re.search(buffers_pattern, qs[0][3]) + assert len(notices) == 2 and 'WARNING: timing statistics disabled\n' in notices \ + and 'WARNING: 
buffers statistics disabled\n' in notices + + common.n_close((acon,)) +def test_skip_explain_analyze(config): + """test skip explain analyze query""" + + acon1, acon2 = common.n_async_connect(config, 2) + query = 'explain analyze select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1' + common.set_guc(acon1, 'pg_query_state.enable_timing', 'off') + common.set_guc(acon1, 'pg_query_state.enable_buffers', 'off') + + qs, notices = common.onetime_query_state_locks(config, acon1, acon2, query, {}) + assert len(notices) > 0 + assert notices[0] == 'INFO: state of backend is active\n' + common.n_close((acon1, acon2)) + + # in hashdata-lightning, only print the top level query state +def test_init_plan(config): + """test statistics of init plan""" + acon1, acon2 = common.n_async_connect(config, 2) + query = 'select (select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1) as t, c1 from bar' + expected = r"""Query Text: select \(select count\(\*\) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1\(foo.c1\)=bar.c1\) as t, c1 from bar +Gather Motion 1:1 \(slice1; segments: 1\) \(node status: Initialize\) \(actual rows=0, loops=1\) + InitPlan 1 \(returns \$0\) \(slice2\) + -> Finalize Aggregate \(node status: Initialize\) \(actual rows=0, loops=1\) + -> Gather Motion \d+:1 \(slice3; segments: \d+\) \(node status: .*\) \(actual rows=0, loops=1\) + -> Partial Aggregate \(node status: Initialize\) \(actual rows=0, loops=1\) + -> Hash Join \(node status: Initialize\) \(actual rows=0, loops=1\) + Hash Cond: \(bar_1.c1 = foo.c1\) + Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar_1.c1\) + -> Seq Scan on bar bar_1 \(node status: Initialize\) \(actual rows=0, loops=1\) + -> Hash \(node status: Initialize\) \(actual rows=0, loops=1\) + -> Seq Scan on foo \(node status: Initialize\) \(actual rows=0, loops=1\) + -> Seq Scan on bar \(node status: Initialize\) \(actual rows=0, loops=1\)""" + + qs,notices = 
common.onetime_query_state_locks(config, acon1, acon2, query, {}) + assert qs[0][0] == acon1.get_backend_pid() + assert qs[0][1] == 0 + assert qs[0][2] == query + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert qs[0][4] == None + # assert qs[0][0] == acon.get_backend_pid() and qs[0][1] == 0 \ + # and qs[0][2] == query and re.match(expected, qs[0][3]) and qs[0][4] == None + + common.n_close((acon1, acon2)) +def test_qe_cache_query(config): + """test qe can cache query state when finished""" + acon1, acon2 = common.n_async_connect(config, 2) + + query = 'select count(*) from foo left join tt on foo.c1 = tt.c1 and tt.c1 < 10 where unlock_if_eq_1(foo.c1) < 100' + expected = r"""Query Text: select count\(\*\) from foo left join tt on foo.c1 = tt.c1 and tt.c1 < 10 where unlock_if_eq_1\(foo.c1\) < 100 +Finalize Aggregate \(node status: .*\) \(actual rows=0, loops=1\) + -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(node status: .*\) \(actual rows=0, loops=1\) + -> Partial Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Hash Left Join \(node status: Executing\) \(actual rows=\d+, loops=1\) + Hash Cond: \(foo.c1 = tt.c1\) + -> Seq Scan on foo \(node status: Executing\) \(actual rows=\d+, loops=1\) + Filter: \(unlock_if_eq_1\(c1\) < 100\) + -> Hash \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Broadcast Motion \d+:\d+ \(slice2; segments: \d+\) \(node status: Finished\) \(actual rows=\d+, loops=1\) + -> Seq Scan on tt \(node status: Finished\) \(actual rows=\d+, loops=1\) + Filter: \(c1 < 10\)""" + time.sleep(1) + qs,notices = common.onetime_query_state_locks(config, acon1, acon2, query, {}) + assert qs[0][0] == acon1.get_backend_pid() + assert qs[0][1] == 0 + assert qs[0][2] == query + if not re.match(expected, qs[0][3]): + print(qs[0][3]) + assert re.match(expected, qs[0][3]) + assert qs[0][4] == None diff --git a/src/backend/commands/explain_gp.c b/src/backend/commands/explain_gp.c index 
ed5b2b1bd48..99260529372 100644 --- a/src/backend/commands/explain_gp.c +++ b/src/backend/commands/explain_gp.c @@ -1546,18 +1546,14 @@ cdbexplain_depositStatsToNode_rt(PlanState *planstate, CdbExplain_RecvStatCtx *c } #endif /* Update nodeStatus - * If nodeStatus is METRICS_PLAN_NODE_UNKNOWN, then nodeStatus is rsi->nodeStatus. - * If nodeStatus is METRICS_PLAN_NODE_INITIALIZE and rsi->nodeStatus is METRICS_PLAN_NODE_EXECUTING, - * then nodeStatus is METRICS_PLAN_NODE_EXECUTING. - * If nodeStatus is METRICS_PLAN_NODE_EXECUTING and rsi->nodeStatus is METRICS_PLAN_NODE_FINISHED, - * then nodeStatus is METRICS_PLAN_NODE_EXECUTING. + * METRICS_PLAN_NODE_UNKNOWN won't appear in rsi->nodeStatus */ - if (nodeStatus == METRICS_PLAN_NODE_UNKNOWN || rsi->nodeStatus == METRICS_PLAN_NODE_EXECUTING) + if (nodeStatus == METRICS_PLAN_NODE_EXECUTING) + continue; + else if (nodeStatus == METRICS_PLAN_NODE_UNKNOWN || rsi->nodeStatus == METRICS_PLAN_NODE_EXECUTING) nodeStatus = rsi->nodeStatus; - else if (nodeStatus != METRICS_PLAN_NODE_EXECUTING) - { - nodeStatus = rsi->nodeStatus < nodeStatus ? rsi->nodeStatus : nodeStatus; - } + else if (nodeStatus != rsi->nodeStatus) + nodeStatus = METRICS_PLAN_NODE_EXECUTING; } /* Save per-node accumulated stats in NodeSummary. */ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 728d12c604a..dad7960b907 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -120,7 +120,7 @@ CreateQueryDesc(PlannedStmt *plannedstmt, qd->showstatctx = NULL; qd->ddesc = NULL; - + qd->showstatctx = NULL; /* not yet executed */ qd->already_executed = false; From 4d086b3d250d3a4d0198fcfafb669f48f0d8feac Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 15 Nov 2024 18:52:33 +0800 Subject: [PATCH 20/40] perfmon: Process nested query gpmon only records the toppest query if there are any nested queries exceuted. Add global var toppest_query in the process to record it. 
--- contrib/perfmon/expected/pg_qs.out | 2 +- contrib/perfmon/expected/query.out | 37 +- contrib/perfmon/sql/query.sql | 18 +- contrib/perfmon/src/gpmon/gpmon.c | 453 +++++++++++------- contrib/perfmon/src/gpmon/pg_query_state.c | 82 ++-- contrib/perfmon/src/gpmon/pg_query_state.h | 5 +- contrib/perfmon/src/gpmon/signal_handler.c | 10 +- .../perfmon/src/gpmon/tests/requirements.txt | 3 + contrib/perfmon/src/gpmon/tests/test_cases.py | 44 +- contrib/perfmon/src/include/gpmon.h | 19 - 10 files changed, 404 insertions(+), 269 deletions(-) create mode 100644 contrib/perfmon/src/gpmon/tests/requirements.txt diff --git a/contrib/perfmon/expected/pg_qs.out b/contrib/perfmon/expected/pg_qs.out index 7ae22632d5e..8a0baa79ddf 100644 --- a/contrib/perfmon/expected/pg_qs.out +++ b/contrib/perfmon/expected/pg_qs.out @@ -17,7 +17,7 @@ test statistics on conflicting tuples under INSERT ON CONFLICT query... ok! test skip explain analyze query... ok! -test statistics of init plan... +test statistics of init plan not supported now... ok! test qe can cache query state when finished... ok! diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index dc5c09cb861..28a8ddc0d1a 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -21,9 +21,22 @@ select count(*) from foo,test where foo.a=test.a; 11 (1 row) +-- test nested query +create or replace function n_join_foo_test() returns integer as $$ +begin + return (select count(*) from foo join test on foo.a=test.a); +end; +$$ language plpgsql; +select * from n_join_foo_test(); + n_join_foo_test +----------------- + 11 +(1 row) + DROP TABLE foo; DROP TABLE test; \c gpperfmon +-- start_ignore select pg_sleep(100); pg_sleep ---------- @@ -34,6 +47,7 @@ analyze system_history; analyze database_history; analyze diskspace_history; analyze queries_history; +-- end_ignore select count(*) > 0 from system_now; ?column? 
---------- @@ -70,11 +84,22 @@ select count(*) > 0 from diskspace_history; t (1 row) -select status, query_text, length(query_plan) > 0 from queries_history -where ssid = :sess_id and -query_text = 'select count(*) from foo,test where foo.a=test.a;'; - status | query_text | ?column? ---------+---------------------------------------------------+---------- - done | select count(*) from foo,test where foo.a=test.a; | t +select ccnt, status, query_text, length(query_plan) > 0 from queries_history +where ssid = :sess_id order by ccnt; + ccnt | status | query_text | ?column? +------+--------+------------------------------------------------------------------+---------- + 2 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t + 4 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t + 8 | done | INSERT INTO foo SELECT generate_series(0,10); | t + 10 | done | INSERT INTO test SELECT generate_series(0,10); | t + 12 | done | select count(*) from foo,test where foo.a=test.a; | t + 15 | done | select * from n_join_foo_test(); | t +(6 rows) + +SELECT COUNT(*) FROM (SELECT DISTINCT ccnt FROM queries_history +where ssid = :sess_id) as temp; + count +------- + 6 (1 row) diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index 412ac44ea01..9195e66d0e2 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -31,15 +31,25 @@ CREATE TABLE test(a int); INSERT INTO foo SELECT generate_series(0,10); INSERT INTO test SELECT generate_series(0,10); select count(*) from foo,test where foo.a=test.a; +-- test nested query +create or replace function n_join_foo_test() returns integer as $$ +begin + return (select count(*) from foo join test on foo.a=test.a); +end; +$$ language plpgsql; + +select * from n_join_foo_test(); DROP TABLE foo; DROP TABLE test; \c gpperfmon +-- start_ignore select pg_sleep(100); analyze system_history; analyze database_history; analyze diskspace_history; analyze 
queries_history; +-- end_ignore select count(*) > 0 from system_now; select count(*) > 0 from database_now; select count(*) > 0 from diskspace_now; @@ -47,6 +57,8 @@ select count(*) > 0 from system_history; select count(*) > 0 from database_history; select count(*) > 0 from diskspace_history; -select status, query_text, length(query_plan) > 0 from queries_history -where ssid = :sess_id and -query_text = 'select count(*) from foo,test where foo.a=test.a;'; +select ccnt, status, query_text, length(query_plan) > 0 from queries_history +where ssid = :sess_id order by ccnt; + +SELECT COUNT(*) FROM (SELECT DISTINCT ccnt FROM queries_history +where ssid = :sess_id) as temp; diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 5fb1c309eb5..0fd96886ec1 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -13,6 +13,7 @@ #include "libpq/pqsignal.h" #include "gpmon.h" +#include "executor/execdesc.h" #include "utils/guc.h" #include "utils/memutils.h" @@ -53,10 +54,23 @@ static void gpmon_query_info_collect_hook(QueryMetricsStatus status, void *query static gpmon_packet_t* gpmon_qlog_packet_init(); static void init_gpmon_hooks(void); -static char* get_plan(QueryDesc *queryDesc); static char* get_query_text(QueryDesc *queryDesc); -static int32 tstart = 0; -static int32 tsubmit = 0; +static bool check_query(QueryDesc *queryDesc, QueryMetricsStatus status); + +static void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket, QueryDesc *qd); +static void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, + const char *queryText, + const char *plan, + const char *appName, + const char *resqName, + const char *resqPriority, + int status); +static void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket, QueryDesc *qd); +static void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, QueryDesc *qd, bool updateRecord); +static void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket, QueryDesc *qd); +static void 
gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket, QueryDesc *qd); +static void gpmon_send(gpmon_packet_t*); +static void gpmon_gettmid(int32*); struct { int gxsock; @@ -66,14 +80,32 @@ struct { int64 gpmon_tick = 0; +typedef struct +{ + /* data */ + int query_command_count; + QueryDesc *queryDesc; + int32 tstart; + int32 tsubmit; +} PerfmonQuery; +PerfmonQuery *toppest_query; + +static void reset_toppest_query(QueryDesc *qd); +static void init_toppest_query(QueryDesc *qd); +static inline PerfmonQuery* get_toppest_perfmon_query(void); +static inline void set_query_tsubmit(int32 tsubmit,QueryDesc *qd); +static inline void set_query_tstart(int32 tstart,QueryDesc *qd); +static inline int get_query_command_count(QueryDesc *qd); +static inline int32 get_query_tsubmit(QueryDesc *qd); +static inline int32 get_query_tstart(QueryDesc *qd); + void gpmon_sig_handler(int sig); -void gpmon_sig_handler(int sig) +void gpmon_sig_handler(int sig) { gpmon_tick++; } - void gpmon_init(void) { // struct itimerval tv; @@ -99,17 +131,6 @@ void gpmon_init(void) } #endif -// /*TODO: what exactly perfmon_send_interval does? 
*/ -// tv.it_interval.tv_sec = perfmon_send_interval; -// //tv.it_interval.tv_sec = 5; -// tv.it_interval.tv_usec = 0; -// tv.it_value = tv.it_interval; -//#ifndef WIN32 -// if (-1 == setitimer(ITIMER_VIRTUAL, &tv, 0)) { -// elog(WARNING, "[perfmon]: unable to start timer (%m)"); -// } -//#endif -// sock = socket(AF_INET, SOCK_DGRAM, 0); if (sock == -1) { elog(WARNING, "[perfmon]: cannot create socket (%m)"); @@ -167,21 +188,21 @@ void gpmon_record_update(int32 tmid, int32 ssid, int32 ccnt, fclose(fp); } -void gpmon_gettmid(int32* tmid) +static void +gpmon_gettmid(int32* tmid) { Assert(init_tmid > -1); *tmid = init_tmid; } - -void gpmon_send(gpmon_packet_t* p) +static void +gpmon_send(gpmon_packet_t* p) { if (p->magic != GPMON_MAGIC) { elog(WARNING, "[perfmon] - bad magic %x", p->magic); return; } - if (p->pkttype == GPMON_PKTTYPE_QEXEC) { elog(DEBUG1, "[perfmon] Perfmon Executor Packet: (tmid, ssid, ccnt, segid, pid, nid, status) = " @@ -216,7 +237,7 @@ void gpmon_send(gpmon_packet_t* p) * key together in 'update_qlog' */ static gpmon_packet_t* -gpmon_qlog_packet_init() +gpmon_qlog_packet_init(QueryDesc *qd) { const char *username = NULL; gpmon_packet_t *gpmonPacket = NULL; @@ -225,7 +246,7 @@ gpmon_qlog_packet_init() Assert(perfmon_enabled && Gp_role == GP_ROLE_DISPATCH); Assert(gpmonPacket); - + gpmonPacket->magic = GPMON_MAGIC; gpmonPacket->version = GPMON_PACKET_VERSION; gpmonPacket->pkttype = GPMON_PKTTYPE_QLOG; @@ -235,7 +256,6 @@ gpmon_qlog_packet_init() gpmonPacket->u.qlog.key.ssid = gp_session_id; gpmonPacket->u.qlog.pid = MyProcPid; - username = GetConfigOption("session_authorization", false, false); /* does not have to be freed */ /* User Id. 
We use session authorization_string (so to make sense with session id) */ snprintf(gpmonPacket->u.qlog.user, sizeof(gpmonPacket->u.qlog.user), "%s", @@ -243,7 +263,7 @@ gpmon_qlog_packet_init() gpmonPacket->u.qlog.dbid = MyDatabaseId; /* Fix up command count */ - gpmonPacket->u.qlog.key.ccnt = gp_command_count; + gpmonPacket->u.qlog.key.ccnt = get_query_command_count(qd); return gpmonPacket; } @@ -262,14 +282,14 @@ gpmon_qexec_packet_init() Assert(perfmon_enabled && Gp_role == GP_ROLE_EXECUTE); Assert(gpmonPacket); - gpmonPacket->magic = GPMON_MAGIC; gpmonPacket->version = GPMON_PACKET_VERSION; gpmonPacket->pkttype = GPMON_PKTTYPE_QEXEC; gpmon_gettmid(&gpmonPacket->u.qexec.key.tmid); gpmonPacket->u.qexec.key.ssid = gp_session_id; - gpmonPacket->u.qexec.key.ccnt = gp_command_count; + /* Better to use get_query_command_count here */ + gpmonPacket->u.qexec.key.ccnt = gp_command_count; gpmonPacket->u.qexec.key.hash_key.segid = GpIdentity.segindex; gpmonPacket->u.qexec.key.hash_key.pid = MyProcPid; return gpmonPacket; @@ -278,18 +298,17 @@ gpmon_qexec_packet_init() /** * Call this method when query is submitted. */ -void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket) +static void +gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket, QueryDesc *qd) { struct timeval tv; - GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); gettimeofday(&tv, 0); - tsubmit = tv.tv_sec; gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_SUBMIT; - gpmonPacket->u.qlog.tsubmit = tsubmit; - + gpmonPacket->u.qlog.tsubmit = tv.tv_sec; + set_query_tsubmit(tv.tv_sec, qd); gpmon_send(gpmonPacket); } @@ -311,9 +330,14 @@ static const char* gpmon_null_subst(const char* input) * \n * Boolean value extraByte indicates whether an additional newline is desired. This is * necessary because gpmon overwrites the last byte to indicate status. 
+ * + * Have tested the speed of this function on local machine + * - each file is 0B, 1000 files, take about 50ms + * - each file is 102B, 1000 files, take about 70ms + * - each file is 57K, 1000 files, take about 240ms */ - -void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, +static void +gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, const char *queryText, const char *plan, const char *appName, @@ -352,26 +376,25 @@ void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, gpmon_record_kv_with_file("resqname", resqName, false, fp); gpmon_record_kv_with_file("priority", resqPriority, true, fp); fprintf(fp, "%d", status); - fclose(fp); - } /** * Call this method when query starts executing. */ -void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket) +static void +gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket, QueryDesc *qd) { struct timeval tv; GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); gettimeofday(&tv, 0); - tstart = tv.tv_sec; gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_START; - gpmonPacket->u.qlog.tsubmit = tsubmit; - gpmonPacket->u.qlog.tstart = tstart; + gpmonPacket->u.qlog.tsubmit = get_query_tsubmit(qd); + gpmonPacket->u.qlog.tstart = tv.tv_sec; + set_query_tstart(tv.tv_sec, qd); gpmon_record_update(gpmonPacket->u.qlog.key.tmid, gpmonPacket->u.qlog.key.ssid, gpmonPacket->u.qlog.key.ccnt, @@ -382,7 +405,8 @@ void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket) /** * Call this method when query finishes executing.
*/ -void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, bool updateRecord) +static void +gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, QueryDesc *qd, bool updateRecord) { struct timeval tv; @@ -390,8 +414,8 @@ void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, bool updateRecord) gettimeofday(&tv, 0); gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_DONE; - gpmonPacket->u.qlog.tsubmit = tsubmit; - gpmonPacket->u.qlog.tstart = tstart; + gpmonPacket->u.qlog.tsubmit = get_query_tsubmit(qd); + gpmonPacket->u.qlog.tstart = get_query_tstart(qd); gpmonPacket->u.qlog.tfin = tv.tv_sec; if (updateRecord) gpmon_record_update(gpmonPacket->u.qlog.key.tmid, @@ -405,7 +429,8 @@ void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, bool updateRecord) /** * Call this method when query errored out. */ -void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket) +static void +gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket, QueryDesc *qd) { struct timeval tv; @@ -414,8 +439,8 @@ void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket) gettimeofday(&tv, 0); gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_ERROR; - gpmonPacket->u.qlog.tsubmit = tsubmit; - gpmonPacket->u.qlog.tstart = tstart; + gpmonPacket->u.qlog.tsubmit = get_query_tsubmit(qd); + gpmonPacket->u.qlog.tstart = get_query_tstart(qd); gpmonPacket->u.qlog.tfin = tv.tv_sec; gpmon_record_update(gpmonPacket->u.qlog.key.tmid, @@ -430,13 +455,13 @@ void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket) * gpmon_qlog_query_canceling * Record that the query is being canceled. 
*/ -void -gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket) +static void +gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket, QueryDesc *qd) { GPMON_QLOG_PACKET_ASSERTS(gpmonPacket); gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_CANCELING; - gpmonPacket->u.qlog.tsubmit = tsubmit; - gpmonPacket->u.qlog.tstart = tstart; + gpmonPacket->u.qlog.tsubmit = get_query_tsubmit(qd); + gpmonPacket->u.qlog.tstart = get_query_tstart(qd); gpmon_record_update(gpmonPacket->u.qlog.key.tmid, gpmonPacket->u.qlog.key.ssid, @@ -446,111 +471,113 @@ gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket) gpmon_send(gpmonPacket); } -static void +static void gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) { char *query_text; char *plan; - QueryDesc *qd = (QueryDesc *)queryDesc; bool updateRecord = true; - if (perfmon_enabled && qd != NULL) + gpmon_packet_t *gpmonPacket = NULL; + if (prev_query_info_collect_hook) + (*prev_query_info_collect_hook)(status, queryDesc); + + if (queryDesc == NULL || !perfmon_enabled) + return; + + if (Gp_role == GP_ROLE_DISPATCH && !check_query((QueryDesc*)queryDesc, status)) + return; + + PG_TRY(); { - gpmon_packet_t *gpmonPacket = NULL; - PG_TRY(); - { if (Gp_role == GP_ROLE_DISPATCH) { - gpmonPacket = gpmon_qlog_packet_init(); + QueryDesc *qd = (QueryDesc *)queryDesc; switch (status) { - case METRICS_QUERY_START: - gpmon_qlog_query_start(gpmonPacket); - break; - case METRICS_QUERY_SUBMIT: + case METRICS_QUERY_SUBMIT: + init_toppest_query(qd); + gpmonPacket = gpmon_qlog_packet_init(qd); + query_text = get_query_text((QueryDesc *)queryDesc); + gpmon_qlog_query_text(gpmonPacket, + query_text, + NULL, + application_name, + NULL, + NULL, + GPMON_QLOG_STATUS_SUBMIT); + gpmon_qlog_query_submit(gpmonPacket, qd); + break; + case METRICS_QUERY_START: + gpmonPacket = gpmon_qlog_packet_init(qd); + gpmon_qlog_query_start(gpmonPacket, qd); + break; + case METRICS_QUERY_DONE: + case METRICS_INNER_QUERY_DONE: + gpmonPacket = 
gpmon_qlog_packet_init(qd); + /* + * plannedstmt in queryDesc may have been cleaned , + * so we cannot check queryId here. + * Only check gp_command_count + */ + if (enable_qs_runtime() && CachedQueryStateInfo != NULL && + get_command_count(CachedQueryStateInfo) == get_query_command_count(qd)) + { query_text = get_query_text(qd); + plan = (char *)CachedQueryStateInfo->data; gpmon_qlog_query_text(gpmonPacket, - query_text, - NULL, - application_name, - NULL, - NULL, - GPMON_QLOG_STATUS_SUBMIT); - gpmon_qlog_query_submit(gpmonPacket); - break; - case METRICS_QUERY_DONE: - /* - * plannedstmt in queryDesc may have been cleaned , - * so we cannot check queryId here. - * Only check gp_command_count - */ - if (enable_qs_runtime() && CachedQueryStateInfo != NULL - && get_command_count(CachedQueryStateInfo) == gp_command_count) - { - query_text = get_query_text(qd); - plan = (char *)CachedQueryStateInfo->data; - gpmon_qlog_query_text(gpmonPacket, - query_text, - plan, - application_name, - NULL, - NULL, - GPMON_QLOG_STATUS_DONE); - updateRecord = false; - } - gpmon_qlog_query_end(gpmonPacket, updateRecord); - break; - /* TODO: no GPMON_QLOG_STATUS for METRICS_QUERY_CANCELED */ - case METRICS_QUERY_CANCELING: - gpmon_qlog_query_canceling(gpmonPacket); - break; - case METRICS_QUERY_ERROR: - case METRICS_QUERY_CANCELED: - gpmon_qlog_query_error(gpmonPacket); - break; - case METRICS_PLAN_NODE_INITIALIZE: - query_text = get_query_text(qd); - plan = get_plan(qd); - gpmon_qlog_query_text(gpmonPacket, - query_text, - plan, - application_name, - NULL, - NULL, - GPMON_QLOG_STATUS_START); - pfree(plan); - break; - default: - break; + query_text, + plan, + application_name, + NULL, + NULL, + GPMON_QLOG_STATUS_DONE); + updateRecord = false; + } + gpmon_qlog_query_end(gpmonPacket, qd, updateRecord); + reset_toppest_query(qd); + break; + /* TODO: no GPMON_QLOG_STATUS for METRICS_QUERY_CANCELED */ + case METRICS_QUERY_CANCELING: + gpmonPacket = gpmon_qlog_packet_init(qd); + 
gpmon_qlog_query_canceling(gpmonPacket, qd); + break; + case METRICS_QUERY_ERROR: + case METRICS_QUERY_CANCELED: + gpmonPacket = gpmon_qlog_packet_init(qd); + gpmon_qlog_query_error(gpmonPacket, qd); + reset_toppest_query(qd); + break; + default: + break; } - pfree(gpmonPacket); + if (gpmonPacket != NULL) + pfree(gpmonPacket); } else if (Gp_role == GP_ROLE_EXECUTE) { - gpmonPacket = gpmon_qexec_packet_init(); - switch (status) - { - case METRICS_QUERY_START: - case METRICS_PLAN_NODE_EXECUTING: - gpmon_send(gpmonPacket); - break; - default: - break; - } - pfree(gpmonPacket); - } - } - PG_CATCH(); - { - EmitErrorReport(); - /* swallow any error in this hook */ - FlushErrorState(); - if (gpmonPacket != NULL) - pfree(gpmonPacket); + gpmonPacket = gpmon_qexec_packet_init(); + switch (status) + { + case METRICS_QUERY_START: + case METRICS_PLAN_NODE_EXECUTING: + gpmon_send(gpmonPacket); + break; + default: + break; + } + pfree(gpmonPacket); } - PG_END_TRY(); + gpmonPacket = NULL; } - if (prev_query_info_collect_hook) - (*prev_query_info_collect_hook) (status, qd); + PG_CATCH(); + { + EmitErrorReport(); + /* swallow any error in this hook */ + FlushErrorState(); + if (gpmonPacket != NULL) + pfree(gpmonPacket); + } + PG_END_TRY(); } static void @@ -592,37 +619,6 @@ void _PG_fini(void) {} -static -char* get_plan(QueryDesc *queryDesc) -{ - char *plan; - ExplainState *es = NewExplainState(); - - es->analyze = false; - es->verbose = true; - es->buffers = true; - es->wal = true; - es->timing = true; - es->summary = es->analyze; - es->format = EXPLAIN_FORMAT_JSON; - es->settings = true; - ExplainBeginOutput(es); - ExplainQueryText(es, queryDesc); - ExplainPrintPlan(es, queryDesc); - ExplainEndOutput(es); - - /* Remove last line break */ - if (es->str->len > 0 && es->str->data[es->str->len - 1] == '\n') - es->str->data[--es->str->len] = '\0'; - - /* Fix JSON to output an object */ - es->str->data[0] = '{'; - es->str->data[es->str->len - 1] = '}'; - plan = es->str->data; - 
pfree(es); - return plan; -} - static char* get_query_text(QueryDesc *qd) { @@ -638,3 +634,126 @@ char* get_query_text(QueryDesc *qd) } return query_text; } + +static +bool check_query(QueryDesc *queryDesc, QueryMetricsStatus status) +{ + PerfmonQuery *query; + switch (status) + { + case METRICS_QUERY_SUBMIT: + return is_querystack_empty(); + case METRICS_QUERY_START: + case METRICS_QUERY_DONE: + case METRICS_INNER_QUERY_DONE: + case METRICS_QUERY_ERROR: + case METRICS_QUERY_CANCELING: + case METRICS_QUERY_CANCELED: + query = get_toppest_perfmon_query(); + return query != NULL && query->queryDesc == queryDesc; + default: + return true; + } + /* + * get_query returns the toppest query in the stack or NULL + */ + return false; +} + +static void +init_toppest_query(QueryDesc *qd) +{ + MemoryContext oldCtx = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + if (is_querystack_empty()) + { + if (toppest_query != NULL) + { + elog(WARNING, "toppest_query not reset properly %d", toppest_query->query_command_count); + pfree(toppest_query); + toppest_query = NULL; + } + PerfmonQuery *query = (PerfmonQuery *)palloc(sizeof(PerfmonQuery)); + query->query_command_count = gp_command_count; + query->tstart = 0; + query->tsubmit = 0; + query->queryDesc = qd; + toppest_query = query; + } + MemoryContextSwitchTo(oldCtx); +} + +static void +reset_toppest_query(QueryDesc *qd) +{ + if (toppest_query != NULL && toppest_query->queryDesc == qd) + { + pfree(toppest_query); + toppest_query = NULL; + } +} + +static inline PerfmonQuery* +get_toppest_perfmon_query(void) +{ + return toppest_query; +} + +static inline void +set_query_tsubmit(int32 tsubmit, QueryDesc *qd) +{ + PerfmonQuery *query = get_toppest_perfmon_query(); + if (query != NULL) + { + Assert(qd == query->queryDesc); + query->tsubmit = tsubmit; + } +} + +static inline void +set_query_tstart(int32 tstart,QueryDesc *qd) +{ + PerfmonQuery *query = get_toppest_perfmon_query(); + if (query != NULL) + { + Assert(qd == 
query->queryDesc); + query->tstart = tstart; + } +} + +static inline int +get_query_command_count(QueryDesc *qd) +{ + PerfmonQuery *query = get_toppest_perfmon_query(); + if (query != NULL) + { + Assert(qd == query->queryDesc); + return query->query_command_count; + } + return gp_command_count; +} + +static inline int32 +get_query_tsubmit(QueryDesc *qd) +{ + PerfmonQuery *query = get_toppest_perfmon_query(); + if (query != NULL) + { + Assert(qd == query->queryDesc); + return query->tsubmit; + } + return 0; +} + +static inline int32 +get_query_tstart(QueryDesc *qd) +{ + + PerfmonQuery *query = get_toppest_perfmon_query(); + if (query != NULL) + { + Assert(qd == query->queryDesc); + return query->tstart; + } + return 0; +} diff --git a/contrib/perfmon/src/gpmon/pg_query_state.c b/contrib/perfmon/src/gpmon/pg_query_state.c index cbe1129b365..9db159805d4 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.c +++ b/contrib/perfmon/src/gpmon/pg_query_state.c @@ -357,22 +357,10 @@ qs_ExecutorStart(QueryDesc *queryDesc, int eflags) if (queryDesc->plannedstmt->queryId == 0) queryDesc->plannedstmt->queryId = ((uint64)gp_command_count << 32) + qs_query_count; - push_query(queryDesc); - /* push query to make pg_query_stat get the stat of initplans*/ - PG_TRY(); - { - if (prev_ExecutorStart) - prev_ExecutorStart(queryDesc, eflags); - else - standard_ExecutorStart(queryDesc, eflags); - pop_query(); - } - PG_CATCH(); - { - pop_query(); - PG_RE_THROW(); - } - PG_END_TRY(); + if (prev_ExecutorStart) + prev_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); } /* @@ -1478,32 +1466,40 @@ qs_print_plan(QueryDesc *queryDesc) return; if (!(Gp_role == GP_ROLE_DISPATCH && enable_qs_runtime())) return; - if (!IsTransactionState()) + if (!IsTransactionState() || !queryDesc->estate) + return; + if (queryDesc->estate->es_sliceTable->hasMotions && + !queryDesc->estate->dispatcherState) return; - /* get dispatch result */ - if 
(!queryDesc->estate->dispatcherState || - !queryDesc->estate->dispatcherState->primaryResults) + if (queryDesc->estate->es_sliceTable->hasMotions && + !queryDesc->estate->dispatcherState->primaryResults) return; - EState *estate = queryDesc->estate; - DispatchWaitMode waitMode = DISPATCH_WAIT_NONE; - if (!estate->es_got_eos) - { - ExecSquelchNode(queryDesc->planstate, true); - } - /* - * Wait for completion of all QEs. We send a "graceful" query - * finish, not cancel signal. Since the query has succeeded, - * don't confuse QEs by sending erroneous message. - */ - if (estate->cancelUnfinished) - waitMode = DISPATCH_WAIT_FINISH; + /* get dispatch result */ + if (queryDesc->estate->dispatcherState && + queryDesc->estate->dispatcherState->primaryResults) + { + EState *estate = queryDesc->estate; + DispatchWaitMode waitMode = DISPATCH_WAIT_NONE; + if (!estate->es_got_eos) + { + ExecSquelchNode(queryDesc->planstate, true); + } - cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE); - cdbdisp_getDispatchResults(queryDesc->estate->dispatcherState, &qeError); - if (qeError) - return; + /* + * Wait for completion of all QEs. We send a "graceful" query + * finish, not cancel signal. Since the query has succeeded, + * don't confuse QEs by sending erroneous message. + */ + if (estate->cancelUnfinished) + waitMode = DISPATCH_WAIT_FINISH; + + cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE); + cdbdisp_getDispatchResults(queryDesc->estate->dispatcherState, &qeError); + if (qeError) + return; + } /* * Make sure we operate in the per-query context, so any cruft will be * discarded later during ExecutorEnd. @@ -1750,6 +1746,7 @@ push_query(QueryDesc *queryDesc) { qs_query_count++; QueryDescStack = lcons(queryDesc, QueryDescStack); + } static void @@ -1764,11 +1761,6 @@ is_querystack_empty(void) return list_length(QueryDescStack) == 0; } -QueryDesc* -get_query(void) -{ - return QueryDescStack == NIL ? 
NULL : (QueryDesc *)llast(QueryDescStack); -} int get_command_count(query_state_info *info) @@ -1777,3 +1769,9 @@ get_command_count(query_state_info *info) return 0; else return info->queryId>>32; } + +QueryDesc* +get_toppest_query(void) +{ + return QueryDescStack == NIL ? NULL : (QueryDesc*) llast(QueryDescStack); +} diff --git a/contrib/perfmon/src/gpmon/pg_query_state.h b/contrib/perfmon/src/gpmon/pg_query_state.h index 2ea4e4ff759..e73f6a38a0f 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.h +++ b/contrib/perfmon/src/gpmon/pg_query_state.h @@ -133,7 +133,7 @@ extern shm_mq *mq; extern query_state_info *CachedQueryStateInfo; -/* pg_query_setat.c */ +/* pg_query_stat.c */ extern shm_mq_result shm_mq_receive_with_timeout(shm_mq_handle *mqh, Size *nbytesp, @@ -160,7 +160,8 @@ extern query_state_info *new_queryStateInfo(int sliceIndex, StringInfo strInfo, uint64 queryId, PG_QS_RequestResult result_code); extern bool wait_for_mq_detached(shm_mq_handle *mqh); + extern bool is_querystack_empty(void); -extern QueryDesc *get_query(void); +extern QueryDesc *get_toppest_query(void); extern int get_command_count(query_state_info *info); #endif diff --git a/contrib/perfmon/src/gpmon/signal_handler.c b/contrib/perfmon/src/gpmon/signal_handler.c index bcd410169cb..a539153757a 100644 --- a/contrib/perfmon/src/gpmon/signal_handler.c +++ b/contrib/perfmon/src/gpmon/signal_handler.c @@ -335,7 +335,7 @@ QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) List *query_state_info_list = NIL; disp_state = palloc0(sizeof(CdbDispatcherState)); shm_mq_msg *pre_check_msg = (shm_mq_msg *)palloc0(sizeof(shm_mq_msg)); - queryDesc = get_query(); + queryDesc = get_toppest_query(); /* first receive the results, it may be empty, such as functions only run on master */ if (!receive_QE_query_state(mqh, &query_state_info_list)) return false; @@ -520,7 +520,7 @@ QE_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) } else { - queryDesc = get_query(); + queryDesc = get_toppest_query(); 
Assert(queryDesc); StringInfo strInfo = cdbexplain_getExecStats_runtime(queryDesc); if (strInfo == NULL) @@ -632,7 +632,7 @@ query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) set_msg(msg, reqid, QUERY_NOT_RUNNING); return false; } - queryDesc = get_query(); + queryDesc = get_toppest_query(); Assert(queryDesc); if (!filter_running_query(queryDesc)) @@ -652,7 +652,7 @@ query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) * * CdbExplain_StatHdr is saved in query_state_info.data */ -static bool +static bool receive_QE_query_state(shm_mq_handle *mqh, List **query_state_info_list) { shm_mq_result mq_receive_result; @@ -703,6 +703,8 @@ process_qe_query_state(QueryDesc *queryDesc, List *query_state_info_list) return results; } estate = queryDesc->estate; + if (estate->es_query_cxt == NULL) + return results; queryId = queryDesc->plannedstmt->queryId; /* first constuct a CdbDispatchResults */ results = makeDispatchResults(estate->es_sliceTable); diff --git a/contrib/perfmon/src/gpmon/tests/requirements.txt b/contrib/perfmon/src/gpmon/tests/requirements.txt new file mode 100644 index 00000000000..ff6b4f49dd9 --- /dev/null +++ b/contrib/perfmon/src/gpmon/tests/requirements.txt @@ -0,0 +1,3 @@ +PyYAML +psycopg2 +progressbar2 diff --git a/contrib/perfmon/src/gpmon/tests/test_cases.py b/contrib/perfmon/src/gpmon/tests/test_cases.py index 6b89b3cce5b..5b7ab676160 100644 --- a/contrib/perfmon/src/gpmon/tests/test_cases.py +++ b/contrib/perfmon/src/gpmon/tests/test_cases.py @@ -416,31 +416,25 @@ def test_skip_explain_analyze(config): # in hashdata-lightning, only print the top level query state def test_init_plan(config): - """test statistics of init plan""" + """test statistics of init plan not supported now""" acon1, acon2 = common.n_async_connect(config, 2) query = 'select (select count(*) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1(foo.c1)=bar.c1) as t, c1 from bar' - expected = r"""Query Text: select \(select count\(\*\) from foo 
join bar on foo.c1=bar.c1 and unlock_if_eq_1\(foo.c1\)=bar.c1\) as t, c1 from bar -Gather Motion 1:1 \(slice1; segments: 1\) \(node status: Initialize\) \(actual rows=0, loops=1\) - InitPlan 1 \(returns \$0\) \(slice2\) - -> Finalize Aggregate \(node status: Initialize\) \(actual rows=0, loops=1\) - -> Gather Motion \d+:1 \(slice3; segments: \d+\) \(node status: .*\) \(actual rows=0, loops=1\) - -> Partial Aggregate \(node status: Initialize\) \(actual rows=0, loops=1\) - -> Hash Join \(node status: Initialize\) \(actual rows=0, loops=1\) - Hash Cond: \(bar_1.c1 = foo.c1\) - Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar_1.c1\) - -> Seq Scan on bar bar_1 \(node status: Initialize\) \(actual rows=0, loops=1\) - -> Hash \(node status: Initialize\) \(actual rows=0, loops=1\) - -> Seq Scan on foo \(node status: Initialize\) \(actual rows=0, loops=1\) - -> Seq Scan on bar \(node status: Initialize\) \(actual rows=0, loops=1\)""" - - qs,notices = common.onetime_query_state_locks(config, acon1, acon2, query, {}) - assert qs[0][0] == acon1.get_backend_pid() - assert qs[0][1] == 0 - assert qs[0][2] == query - if not re.match(expected, qs[0][3]): - print(qs[0][3]) - assert re.match(expected, qs[0][3]) - assert qs[0][4] == None + #expected = r"""Query Text: select \(select count\(\*\) from foo join bar on foo.c1=bar.c1 and unlock_if_eq_1\(foo.c1\)=bar.c1\) as t, c1 from bar + #Gather Motion 1:1 \(slice1; segments: 1\) \(node status: Initialize\) \(actual rows=0, loops=1\) + # InitPlan 1 \(returns \$0\) \(slice2\) + # -> Finalize Aggregate \(node status: Initialize\) \(actual rows=0, loops=1\) + # -> Gather Motion \d+:1 \(slice3; segments: \d+\) \(node status: .*\) \(actual rows=0, loops=1\) + # -> Partial Aggregate \(node status: Initialize\) \(actual rows=0, loops=1\) + # -> Hash Join \(node status: Initialize\) \(actual rows=0, loops=1\) + # Hash Cond: \(bar_1.c1 = foo.c1\) + # Join Filter: \(unlock_if_eq_1\(foo.c1\) = bar_1.c1\) + # -> Seq Scan on bar bar_1 \(node status: 
Initialize\) \(actual rows=0, loops=1\) + # -> Hash \(node status: Initialize\) \(actual rows=0, loops=1\) + # -> Seq Scan on foo \(node status: Initialize\) \(actual rows=0, loops=1\) + # -> Seq Scan on bar \(node status: Initialize\) \(actual rows=0, loops=1\)""" + + _,notices = common.onetime_query_state_locks(config, acon1, acon2, query, {}) + assert notices[0] == 'INFO: state of backend is active\n' # assert qs[0][0] == acon.get_backend_pid() and qs[0][1] == 0 \ # and qs[0][2] == query and re.match(expected, qs[0][3]) and qs[0][4] == None @@ -451,8 +445,8 @@ def test_qe_cache_query(config): query = 'select count(*) from foo left join tt on foo.c1 = tt.c1 and tt.c1 < 10 where unlock_if_eq_1(foo.c1) < 100' expected = r"""Query Text: select count\(\*\) from foo left join tt on foo.c1 = tt.c1 and tt.c1 < 10 where unlock_if_eq_1\(foo.c1\) < 100 -Finalize Aggregate \(node status: .*\) \(actual rows=0, loops=1\) - -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(node status: .*\) \(actual rows=0, loops=1\) +Finalize Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) + -> Gather Motion \d+:1 \(slice1; segments: \d+\) \(node status: .*\) \(actual rows=\d+, loops=1\) -> Partial Aggregate \(node status: .*\) \(actual rows=\d+, loops=1\) -> Hash Left Join \(node status: Executing\) \(actual rows=\d+, loops=1\) Hash Cond: \(foo.c1 = tt.c1\) diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index beae07218cf..a1c5ead3564 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -41,25 +41,6 @@ for example SCHEMA.RELATION\0 #define SCAN_REL_NAME_BUF_SIZE (NAMEDATALEN*2) -/* ------------------------------------------------------------------ - INTERFACE - ------------------------------------------------------------------ */ - -extern void gpmon_qlog_query_submit(gpmon_packet_t *gpmonPacket); -extern void gpmon_qlog_query_text(const gpmon_packet_t *gpmonPacket, - const char *queryText, - const 
char *plan, - const char *appName, - const char *resqName, - const char *resqPriority, - int status); -extern void gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket); -extern void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, bool updateRecord); -extern void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket); -extern void gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket); -extern void gpmon_send(gpmon_packet_t*); -extern void gpmon_gettmid(int32*); - /* ------------------------------------------------------------------ FSINFO ------------------------------------------------------------------ */ From 040cc5fe39b89936417b8b036874ec8632ae54d0 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Tue, 19 Nov 2024 08:28:41 +0000 Subject: [PATCH 21/40] Remove with(fillfactor=100) that incompatible with current behavior --- contrib/perfmon/perfmon.sql | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/contrib/perfmon/perfmon.sql b/contrib/perfmon/perfmon.sql index fb457d05074..642062de50e 100644 --- a/contrib/perfmon/perfmon.sql +++ b/contrib/perfmon/perfmon.sql @@ -51,7 +51,6 @@ create table public.system_history ( net_rb_rate bigint not null, -- system net read bytes per second net_wb_rate bigint not null -- system net write bytes per second ) -with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); @@ -103,7 +102,6 @@ create table public.queries_history ( disk_read bigint not null, -- disk read for all processes executing query disk_write bigint not null -- disk write for all processes executing query ) -with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); @@ -152,7 +150,6 @@ create table public.database_history ( queries_running int not null, -- number of running queries queries_queued int not null -- number of queued queries ) -with 
(fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); @@ -186,7 +183,7 @@ format 'csv' (delimiter '|'); -- hostname hostname of system this metric belongs to -- dynamic_memory_used bytes of dynamic memory used by the segment -- dynamic_memory_available bytes of dynamic memory available for use by the segment -create table public.segment_history (ctime timestamptz(0) not null, dbid int not null, hostname varchar(64) not null, dynamic_memory_used bigint not null, dynamic_memory_available bigint not null) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); +create table public.segment_history (ctime timestamptz(0) not null, dbid int not null, hostname varchar(64) not null, dynamic_memory_used bigint not null, dynamic_memory_available bigint not null) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); -- TABLE: segment_now -- (like segment_history) @@ -218,7 +215,7 @@ CREATE VIEW public.memory_info as select public.system_history.ctime, public.sys -- total_bytes bytes total in filesystem -- bytes_used bytes used in the filesystem -- bytes_available bytes available in the filesystem -create table public.diskspace_history (ctime timestamptz(0) not null, hostname varchar(64) not null, filesystem text not null, total_bytes bigint not null, bytes_used bigint not null, bytes_available bigint not null) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); +create table public.diskspace_history (ctime timestamptz(0) not null, hostname varchar(64) not null, filesystem text not null, total_bytes bigint not null, bytes_used bigint not null, bytes_available bigint not null) distributed by (ctime) partition by range 
(ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); --- TABLE: diskspace_now -- (like diskspace_history) @@ -253,7 +250,7 @@ create external web table public._diskspace_tail (like public.diskspace_history) -- transmit_collision_errors bigint, -- transmit_carrier_errors bigint, -- transmit_compressed_packets int -create table public.network_interface_history ( ctime timestamptz(0) not null, hostname varchar(64) not null, interface_name varchar(64) not null, bytes_received bigint, packets_received bigint, receive_errors bigint, receive_drops bigint, receive_fifo_errors bigint, receive_frame_errors bigint, receive_compressed_packets int, receive_multicast_packets int, bytes_transmitted bigint, packets_transmitted bigint, transmit_errors bigint, transmit_drops bigint, transmit_fifo_errors bigint, transmit_collision_errors bigint, transmit_carrier_errors bigint, transmit_compressed_packets int) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); +create table public.network_interface_history ( ctime timestamptz(0) not null, hostname varchar(64) not null, interface_name varchar(64) not null, bytes_received bigint, packets_received bigint, receive_errors bigint, receive_drops bigint, receive_fifo_errors bigint, receive_frame_errors bigint, receive_compressed_packets int, receive_multicast_packets int, bytes_transmitted bigint, packets_transmitted bigint, transmit_errors bigint, transmit_drops bigint, transmit_fifo_errors bigint, transmit_collision_errors bigint, transmit_carrier_errors bigint, transmit_compressed_packets int) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); --- TABLE: network_interface_now -- (like network_interface_history) @@ -283,7 +280,7 @@ create external web table public._network_interface_tail (like public.network_in -- frag_sockets_inuse 
int, -- frag_sockets_memusage_inbytes int -create table public.socket_history ( ctime timestamptz(0) not null, hostname varchar(64) not null, total_sockets_used int, tcp_sockets_inuse int, tcp_sockets_orphan int, tcp_sockets_timewait int, tcp_sockets_alloc int, tcp_sockets_memusage_inbytes int, udp_sockets_inuse int, udp_sockets_memusage_inbytes int, raw_sockets_inuse int, frag_sockets_inuse int, frag_sockets_memusage_inbytes int) with (fillfactor=100) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); +create table public.socket_history ( ctime timestamptz(0) not null, hostname varchar(64) not null, total_sockets_used int, tcp_sockets_inuse int, tcp_sockets_orphan int, tcp_sockets_timewait int, tcp_sockets_alloc int, tcp_sockets_memusage_inbytes int, udp_sockets_inuse int, udp_sockets_memusage_inbytes int, raw_sockets_inuse int, frag_sockets_inuse int, frag_sockets_memusage_inbytes int) distributed by (ctime) partition by range (ctime)(start (date '2010-01-01') end (date '2010-02-01') EVERY (interval '1 month')); --- TABLE: socket_now -- (like socket_history) @@ -413,4 +410,4 @@ CREATE FUNCTION query_state_pause_command() CREATE FUNCTION query_state_resume_command() RETURNS void AS 'MODULE_PATHNAME' - LANGUAGE C STRICT VOLATILE; \ No newline at end of file + LANGUAGE C STRICT VOLATILE; From d8ac2ee295fc421a906f150089837d48d5808d7c Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Tue, 12 Nov 2024 17:48:53 +0800 Subject: [PATCH 22/40] [perfmon] Fix some issues in gpmmon - write the time with timezone - set the default value of tsubmit/tstart/tfinish to -infinity - modify agg_dup agg_dup is used to copy the unfinished query into a new hash table and clean the old one. It reads the query text file and gets status from it. From the status, it can know if the query is finished. 
But the queries in the hash table are a little behind the status from the query text file; using that status to filter queries may lead to some issues: a finished query is not recorded, or the tfinish time is empty. So we use the status from the hash table to check the query. For a query that is already recorded, or whose execution time is less than the configured time, just clean it. --- contrib/perfmon/src/common/gpmonlib.c | 11 +-- contrib/perfmon/src/gpmmon/gpmon_agg.c | 104 ++++++------------------- contrib/perfmon/src/include/gpmonlib.h | 2 +- 3 files changed, 29 insertions(+), 88 deletions(-) diff --git a/contrib/perfmon/src/common/gpmonlib.c b/contrib/perfmon/src/common/gpmonlib.c index b56ab60de9e..e7c7d5c0e5c 100644 --- a/contrib/perfmon/src/common/gpmonlib.c +++ b/contrib/perfmon/src/common/gpmonlib.c @@ -300,12 +300,9 @@ char* gpmon_datetime(time_t t, char str[GPMON_DATE_BUF_SIZE]) gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "localtime_r failed"); return str; } + strftime(str, GPMON_DATE_BUF_SIZE - 1, "%Y-%m-%d %H:%M:%S%z", &tm); - snprintf(str, GPMON_DATE_BUF_SIZE, "%04d-%02d-%02d %02d:%02d:%02d", - 1900 + tm.tm_year, tm.tm_mon + 1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); - - return str; + return str; } /* datetime, e.g. 
2004-02-14 23:50:10 @@ -322,9 +319,7 @@ char* gpmon_datetime_rounded(time_t t, char str[GPMON_DATE_BUF_SIZE]) return str; } - snprintf(str, GPMON_DATE_BUF_SIZE, "%04d-%02d-%02d %02d:%02d:%02d", - 1900 + tm.tm_year, tm.tm_mon + 1, tm.tm_mday, - tm.tm_hour, tm.tm_min, ((tm.tm_sec/5)*5)); + strftime(str, GPMON_DATE_BUF_SIZE - 1, "%Y-%m-%d %H:%M:%S%z", &tm); return str; } diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index ca9c197d4e8..dd8ce9f7fbe 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -75,7 +75,7 @@ int32 tmid = -1; extern void incremement_tail_bytes(apr_uint64_t bytes); static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool); -void gpdb_get_spill_file_size_from_query(qdnode_t *qdnode); +static void format_time(time_t tt, char *buf); /** * Disk space check helper function @@ -511,30 +511,26 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr void* vptr; qdnode_t* dp; qdnode_t* newdp; - apr_int32_t status; apr_hash_this(hi, 0, 0, &vptr); dp = vptr; - - /* skip all entries that weren't updated recently and aren't waiting in a queue */ - /* Read status from query text as this is reliable */ - /* Todo Why read status from query text instead of dp?*/ - status = get_query_status(dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt); + if (dp->recorded) + continue; + if ( (dp->qlog.status == GPMON_QLOG_STATUS_DONE || dp->qlog.status == GPMON_QLOG_STATUS_ERROR) && + (dp->qlog.tfin > 0 && ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time ))) + { + TR2(("agg_dup: skip short query %d.%d.%d generation %d, current generation %d, recorded %d\n", + dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, + (int) dp->last_updated_generation, (int) newagg->generation, dp->recorded)); + continue; + } + if (dp->qlog.status == GPMON_QLOG_STATUS_INVALID || dp->qlog.status == 
GPMON_QLOG_STATUS_SILENT) + continue; apr_int32_t age = newagg->generation - dp->last_updated_generation - 1; if (age > 0) { - if (status == GPMON_QLOG_STATUS_DONE && dp->qlog.tfin > 0 && ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time )) - { - TR2(("agg_dup: skip short query %d.%d.%d generation %d, current generation %d, recorded %d\n", - dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, - (int) dp->last_updated_generation, (int) newagg->generation, dp->recorded)); - continue; - } - if ( (status != GPMON_QLOG_STATUS_SUBMIT - && status != GPMON_QLOG_STATUS_CANCELING - && status != GPMON_QLOG_STATUS_START) - || ((age % 5 == 0) /* don't call is_query_not_active every time because it's expensive */ + if (((age % 5 == 0) /* don't call is_query_not_active every time because it's expensive */ && is_query_not_active(dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, active_query_tab, newagg->pool))) { if (dp->qlog.dbid != gpperfmon_dbid) @@ -546,14 +542,6 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr continue; } } - else if (dp->qlog.status == GPMON_QLOG_STATUS_DONE && status == GPMON_QLOG_STATUS_INVALID) - { - continue; - } - - /* check if we missed a status change */ - if (dp->qlog.status != status) - dp->qlog.status = status; if (dp->qlog.dbid != gpperfmon_dbid) { TR2( ("agg_dup: add %d.%d.%d, generation %d, recorded %d:\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, (int) dp->last_updated_generation, dp->recorded)); @@ -567,29 +555,6 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr *newdp = *dp; - // newdp->qexec_hash = apr_hash_make(newagg->pool); - // if (!newdp->qexec_hash) { - // agg_destroy(newagg); - // return APR_ENOMEM; - // } - - // cnt = 0; - // // Copy the qexec hash table - // for (hj = apr_hash_first(newagg->pool, dp->qexec_hash); hj; hj = apr_hash_next(hj)) { - // mmon_qexec_t* new_qexec; - // apr_hash_this(hj, 0, 0, &vptr); - - // //allocate the 
packet - // if (!(new_qexec = apr_pcalloc(newagg->pool, sizeof(mmon_qexec_t)))) { - // agg_destroy(newagg); - // return APR_ENOMEM; - // } - // *new_qexec = *((mmon_qexec_t*)vptr); - - // apr_hash_set(newdp->qexec_hash, &(new_qexec->key.hash_key), sizeof(new_qexec->key.hash_key), new_qexec); - // TR2( ("\t %d: (%d, %d)\n", ++cnt, new_qexec->key.hash_key.segid, new_qexec->key.hash_key.nid)); - // } - newdp->query_seginfo_hash = apr_hash_make(newagg->pool); if (!newdp->query_seginfo_hash) { agg_destroy(newagg); @@ -1551,6 +1516,14 @@ static char* replaceQuotes(char *str, apr_pool_t* pool, int* size) { return newStr; } +static void format_time(time_t tt, char *buf) +{ + if (tt) + gpmon_datetime(tt, buf); + else + snprintf(buf, GPMON_DATE_BUF_SIZE, "-infinity"); +} + static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nowstr, apr_pool_t* pool) { char timsubmitted[GPMON_DATE_BUF_SIZE]; @@ -1564,39 +1537,12 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows int fd_cnt; cpu_skew = get_cpu_skew(qdnode); qdnode->qlog.p_metrics.cpu_skew += cpu_skew; - //row_skew = get_row_skew(qdnode); - //rowsout = get_rowsout(qdnode); // get spill file size gpdb_get_spill_file_size_from_query(qdnode); - - if (qdnode->qlog.tsubmit) - { - gpmon_datetime((time_t)qdnode->qlog.tsubmit, timsubmitted); - } - else - { - snprintf(timsubmitted, GPMON_DATE_BUF_SIZE, "null"); - } - - if (qdnode->qlog.tstart) - { - gpmon_datetime((time_t)qdnode->qlog.tstart, timstarted); - } - else - { - snprintf(timstarted, GPMON_DATE_BUF_SIZE, "null"); - } - - if (qdnode->qlog.tfin) - { - gpmon_datetime((time_t)qdnode->qlog.tfin, timfinished); - } - else - { - snprintf(timfinished, GPMON_DATE_BUF_SIZE, "null"); - } - + format_time(qdnode->qlog.tsubmit, timsubmitted); + format_time(qdnode->qlog.tstart, timstarted); + format_time(qdnode->qlog.tfin, timfinished); if (qdnode->num_metrics_packets) { diff --git a/contrib/perfmon/src/include/gpmonlib.h 
b/contrib/perfmon/src/include/gpmonlib.h index 2cf9fab5d0c..5552970a39c 100644 --- a/contrib/perfmon/src/include/gpmonlib.h +++ b/contrib/perfmon/src/include/gpmonlib.h @@ -43,7 +43,7 @@ extern int verbose; #define GPSMON_METRIC_MAX 0xffffffffUL #endif -#define GPMON_DATE_BUF_SIZE 24 +#define GPMON_DATE_BUF_SIZE 30 extern Oid gpperfmon_dbid; From 45a888ab2f957afe409f69e0bb2e5af13cc95754 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 22 Nov 2024 19:33:21 +0800 Subject: [PATCH 23/40] [perfmon] Remove unused code --- contrib/perfmon/src/gpmmon/gpmmon.c | 59 ------ contrib/perfmon/src/gpmmon/gpmon_agg.c | 275 ------------------------- contrib/perfmon/src/gpmmon/gpmondb.c | 160 -------------- 3 files changed, 494 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index 3de9897451e..33881ebb85c 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -849,50 +849,6 @@ static void* harvest_main(apr_thread_t* thread_, void* arg_) } -/* Separate thread for message sending */ -/* As gp_elog has been removed, disable this function */ -/* -static void* message_main(apr_thread_t* thread_, void* arg_) -{ - apr_queue_t *queue = arg_; - void *query = NULL; - apr_status_t status; - - TR2(("In message_main: error_disk_space_percentage = %d, warning_disk_space_percentage = %d, disk_space_interval = %d, max_disk_space_messages_per_interval = %d\n", - opt.error_disk_space_percentage, opt.warning_disk_space_percentage, (int) opt.disk_space_interval, opt.max_disk_space_messages_per_interval)); - while (1) - { - query = NULL; - status = apr_queue_pop(queue, &query); - if (status == APR_EINTR) - { //the blocking operation was interrupted (try again) - continue; - } - else if (status != APR_SUCCESS) - { - interuptable_sleep(30); // sleep to prevent loop of forking process and failing - gpmon_fatalx( - FLINE, status, "message_main ERROR: apr_queue_pop failed: returned %d", status); - return (void*)1; 
- } - else if (NULL == query) - { - TR0(("message_main ERROR: apr_queue_pop returned NULL\n")); - } - else - { // send the message - if (!gpdb_exec_search_for_at_least_one_row((const char *)query, NULL)) - { - TR0(("message_main ERROR: query %s failed. Cannot send message\n", (char *) query)); - } - free(query); - } - - } - return APR_SUCCESS; -} -*/ - time_t compute_next_dump_to_file() { time_t current_time = time(NULL); @@ -1031,21 +987,6 @@ static void gpmmon_main(void) gpmon_fatalx(FLINE, e, "apr_thread_create failed"); } - /* gp_elog has been removed in hashdata-lightning */ - ///* Create message queue */ - //if (0 != (e = apr_queue_create(&message_queue, MAX_MESSAGES_PER_INTERVAL, ax.pool))) - //{ - // interuptable_sleep(30); // sleep to prevent loop of forking process and failing - // gpmon_fatalx(FLINE, e, "apr_queue_create failed"); - //} - - ///* spawn disk space message thread */ - //if (0 != (e = apr_thread_create(&message_th, ta, message_main, message_queue, ax.pool))) - //{ - // interuptable_sleep(30); // sleep to prevent loop of forking process and failing - // gpmon_fatalx(FLINE, e, "apr_thread_create failed"); - //} - /* main loop */ while (!ax.exit) { diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index dd8ce9f7fbe..1fb23820242 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -77,92 +77,6 @@ static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool); static void format_time(time_t tt, char *buf); -/** - * Disk space check helper function - * Note- trys to push a message on a queue so that the message thread can send the message - */ -/* gp_elog has been removed */ -/* -static apr_status_t check_disk_space(mmon_fsinfo_t* rec) -{ - static time_t interval_start_time = 0; - static unsigned int number_messages_sent_this_interval = 0; - time_t now = 0; - int used_disk_space_percent = 
ROUND_DIVIDE((rec->bytes_used *100),rec->bytes_total); - - now = time(NULL); - // reset the interval if needed - if ((now - interval_start_time) >= opt.disk_space_interval){ - interval_start_time = now; - number_messages_sent_this_interval = 0; - } - - // Check the disk space if we haven't already sent an error - if (rec->sent_error_flag != DISK_SPACE_ERROR_SENT) { - disk_space_message_t send_flag = DISK_SPACE_NO_MESSAGE_SENT; - char* message = 0; - - // check for errors and then warnings - if ((opt.error_disk_space_percentage != 0) && (used_disk_space_percent >= opt.error_disk_space_percentage)) { - //Send an error if the error_disk_space_percentage threshold is set and the used_disk_space_percent is greater or equal to it - send_flag = DISK_SPACE_ERROR_SENT; - message = "ERROR"; - } else if ((rec->sent_error_flag != DISK_SPACE_WARNING_SENT) && (opt.warning_disk_space_percentage != 0 ) && - (used_disk_space_percent >= opt.warning_disk_space_percentage)) { - //Send warning if the warning_disk_space_percentage threshold is set and the used_disk_space_percent is greater or equal to it - //and if a warning has not already been sent - send_flag = DISK_SPACE_WARNING_SENT; - message = "WARNING"; - } else if ((rec->sent_error_flag == DISK_SPACE_WARNING_SENT) && (used_disk_space_percent < opt.warning_disk_space_percentage)) { - //if a warning as been sent and the used disk has fallen below the below the warning threshold reset the send flag - rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; - } - - // Send a warning or error if needed by putting the message in a queue - if (send_flag != DISK_SPACE_NO_MESSAGE_SENT){ - //only sent the message if - if (number_messages_sent_this_interval < opt.max_disk_space_messages_per_interval) { - char *query; - apr_status_t status; - unsigned int query_size_max = NAMEDATALEN + GPMON_FSINFO_MAX_PATH + 200; - - query = malloc(query_size_max); - if (!query) { - TR0(("check_disk_space ERROR: malloc(%d) returned NULL, out of memory\n", 
query_size_max)); - return APR_ENOMEM; - } - snprintf(query, query_size_max, "select gp_elog('%s: percent used disk space for %s %s is %d%%', True)", - message, rec->key.hostname, rec->key.fsname, used_disk_space_percent); - - status = apr_queue_trypush(message_queue, (void *) query); - if (status == APR_EINTR) { //blocking interrupted try one more time - status = apr_queue_trypush(message_queue, (void *) query); - } - if (status != APR_SUCCESS) { - TR0(("check_disk_space ERROR: apr_queue_trypush returned %d; cannot send %s\n", status, query)); - free(query); - } else { - number_messages_sent_this_interval++; - } - - } else { - TR1(("check_disk_space: message max reached: Not sending message for %s %s. used_disk_space_percent = %d%%\n", rec->key.hostname, rec->key.fsname, used_disk_space_percent)); - } - - rec->sent_error_flag = send_flag; - } - - } else if ( ( opt.warning_disk_space_percentage != 0 ) && ( used_disk_space_percent < opt.warning_disk_space_percentage )) { - //if there is a warning percent to check and the used disk has fallen below the below the warning threshold reset the send flag - rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; - } else if ( ( opt.warning_disk_space_percentage == 0 ) && ( used_disk_space_percent < opt.error_disk_space_percentage )) { - //if there is no warning percent to check and the used disk has fallen below the below the error threshold reset the send flag - rec->sent_error_flag = DISK_SPACE_NO_MESSAGE_SENT; - } - return 0; -} -*/ - static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool) { // get active query of session @@ -287,28 +201,6 @@ static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) return 0; } -// static apr_status_t agg_put_segment(agg_t* agg, const gpmon_seginfo_t* seg) -// { -// gpmon_seginfo_t* rec; - -// rec = apr_hash_get(agg->stab, &seg->dbid, sizeof(seg->dbid)); -// if (rec) -// { -// *rec = *seg; -// } -// else -// 
{ -// rec = apr_palloc(agg->pool, sizeof(*rec)); -// if (!rec) -// { -// return APR_ENOMEM; -// } -// *rec = *seg; -// apr_hash_set(agg->stab, &rec->dbid, sizeof(rec->dbid), rec); -// } -// return 0; -// } - static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { gpmon_qlogkey_t key = qlog->key; @@ -399,51 +291,6 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, return 0; } - -// static apr_status_t agg_put_qexec(agg_t* agg, const qexec_packet_t* qexec_packet, apr_int64_t generation) -// { -// qdnode_t* dp; -// gpmon_qlogkey_t key; -// mmon_qexec_t* mmon_qexec_existing = 0; - -// /* find qdnode of this qexec */ -// key.tmid = qexec_packet->data.key.tmid; -// key.ssid = qexec_packet->data.key.ssid; -// key.ccnt = qexec_packet->data.key.ccnt; -// dp = apr_hash_get(agg->qtab, &key, sizeof(key)); - -// if (!dp) { /* not found, internal SPI query. Ignore. */ -// return 0; -// } - -// mmon_qexec_existing = apr_hash_get(dp->qexec_hash, &qexec_packet->data.key.hash_key, sizeof(qexec_packet->data.key.hash_key)); - -// /* if found, replace it */ -// if (mmon_qexec_existing) { -// mmon_qexec_existing->key.ccnt = qexec_packet->data.key.ccnt; -// mmon_qexec_existing->key.ssid = qexec_packet->data.key.ssid; -// mmon_qexec_existing->key.tmid = qexec_packet->data.key.tmid; -// mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; -// mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; -// mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; -// } -// else { -// /* not found, make new hash entry */ -// if (! 
(mmon_qexec_existing = apr_palloc(agg->pool, sizeof(mmon_qexec_t)))) -// return APR_ENOMEM; - -// memcpy(&mmon_qexec_existing->key, &qexec_packet->data.key, sizeof(gpmon_qexeckey_t)); -// mmon_qexec_existing->_cpu_elapsed = qexec_packet->data._cpu_elapsed; -// mmon_qexec_existing->measures_rows_in = qexec_packet->data.measures_rows_in; -// mmon_qexec_existing->rowsout = qexec_packet->data.rowsout; -// apr_hash_set(dp->qexec_hash, &mmon_qexec_existing->key.hash_key, sizeof(mmon_qexec_existing->key.hash_key), mmon_qexec_existing); -// } - -// dp->last_updated_generation = generation; -// return 0; -// } - - apr_status_t agg_create(agg_t** retagg, apr_int64_t generation, apr_pool_t* parent_pool, apr_hash_t* fsinfotab) { int e; @@ -595,13 +442,6 @@ apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt) return agg_put_metrics(agg, &pkt->u.metrics); if (pkt->header.pkttype == GPMON_PKTTYPE_QLOG) return agg_put_qlog(agg, &pkt->u.qlog, agg->generation); - /* - hashdata-lightning not use - if (pkt->header.pkttype == GPMON_PKTTYPE_QEXEC) - return agg_put_qexec(agg, &pkt->u.qexec_packet, agg->generation); - if (pkt->header.pkttype == GPMON_PKTTYPE_SEGINFO) - return agg_put_segment(agg, &pkt->u.seginfo); - */ if (pkt->header.pkttype == GPMON_PKTTYPE_QUERY_HOST_METRICS) return agg_put_query_metrics(agg, &pkt->u.qlog, agg->generation); if (pkt->header.pkttype == GPMON_PKTTYPE_FSINFO) @@ -655,20 +495,11 @@ apr_status_t agg_dump(agg_t* agg) bloom_set(&bloom, GPMON_DIR "queries_tail.dat"); bloom_set(&bloom, GPMON_DIR "queries_stage.dat"); bloom_set(&bloom, GPMON_DIR "_queries_tail.dat"); - bloom_set(&bloom, GPMON_DIR "database_now.dat"); - bloom_set(&bloom, GPMON_DIR "database_tail.dat"); - bloom_set(&bloom, GPMON_DIR "database_stage.dat"); - bloom_set(&bloom, GPMON_DIR "_database_tail.dat"); - bloom_set(&bloom, GPMON_DIR "segment_now.dat"); - bloom_set(&bloom, GPMON_DIR "segment_tail.dat"); - bloom_set(&bloom, GPMON_DIR "segment_stage.dat"); - bloom_set(&bloom, 
GPMON_DIR "_segment_tail.dat"); bloom_set(&bloom, GPMON_DIR "diskspace_now.dat"); bloom_set(&bloom, GPMON_DIR "diskspace_tail.dat"); bloom_set(&bloom, GPMON_DIR "diskspace_stage.dat"); bloom_set(&bloom, GPMON_DIR "_diskspace_tail.dat"); - /* dump metrics */ temp_bytes_written = write_system(agg, nowstr); incremement_tail_bytes(temp_bytes_written); @@ -1068,29 +899,6 @@ static apr_uint32_t write_system(agg_t* agg, const char* nowstr) return bytes_written; } -// static apr_int64_t get_rowsout(qdnode_t* qdnode) -// { - -// apr_hash_index_t *hi; -// //qenode_t* pqe = NULL; -// apr_int64_t rowsout = 0; -// void* valptr; -// mmon_query_seginfo_t *query_seginfo; - -// for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) -// { -// apr_hash_this(hi, 0, 0, &valptr); -// query_seginfo = (mmon_query_seginfo_t*) valptr; -// if (query_seginfo->final_rowsout != -1) -// { -// rowsout = query_seginfo->final_rowsout; -// break; -// } -// } -// return rowsout; -// } - - static void _get_sum_seg_info(apr_hash_t* segtab, apr_int64_t* total_data_out, int* segcount_out) { apr_hash_index_t *hi; @@ -1207,89 +1015,6 @@ static double get_cpu_skew(qdnode_t* qdnode) return coefficient_of_variation; } -// static double get_row_skew(qdnode_t* qdnode) -// { -// apr_pool_t* tmp_pool; -// apr_hash_t* segtab; -// apr_hash_index_t *hi; -// -// apr_int64_t total_row_out = 0; -// apr_int64_t total_deviation_squared = 0; -// double variance = 0.0f; -// double standard_deviation = 0; -// double coefficient_of_variation = 0; -// apr_int64_t row_out_avg = 0; -// apr_int64_t* seg_row_out_sum = NULL; -// void* valptr; -// -// int segcnt = 0; -// int e; -// -// if (!qdnode) -// return 0.0f; -// -// if (0 != (e = apr_pool_create_alloc(&tmp_pool, 0))) -// { -// gpmon_warningx(FLINE, e, "apr_pool_create_alloc failed"); -// return 0.0f; -// } -// -// segtab = apr_hash_make(tmp_pool); -// if (!segtab) -// { -// gpmon_warning(FLINE, "Out of memory"); -// return 0.0f; -// } -// -// 
/* Calc rows in sum per segment */ -// TR2(("Calc rows in sum per segment\n")); -// for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) -// { -// mmon_query_seginfo_t *rec; -// apr_hash_this(hi, 0, 0, &valptr); -// rec = (mmon_query_seginfo_t*) valptr; -// -// if (rec->key.segid == -1) -// continue; -// -// seg_row_out_sum = apr_hash_get(segtab, &rec->key.segid, sizeof(rec->key.segid)); -// -// if (!seg_row_out_sum) { -// seg_row_out_sum = apr_palloc(tmp_pool, sizeof(apr_int64_t)); -// *seg_row_out_sum = 0; -// } -// *seg_row_out_sum += rec->sum_measures_rows_out; -// apr_hash_set(segtab, &rec->key.segid, sizeof(rec->key.segid), seg_row_out_sum); -// } -// -// _get_sum_seg_info(segtab, &total_row_out, &segcnt); -// -// if (!segcnt) { -// TR2(("No segments for Rows skew calculation\n")); -// apr_pool_destroy(tmp_pool); -// return 0.0f; -// } -// -// row_out_avg = total_row_out / segcnt; -// -// TR2(("(SKEW) Avg rows out: %" FMT64 "\n", row_out_avg)); -// -// _get_sum_deviation_squared(segtab, row_out_avg, &total_deviation_squared); -// -// variance = total_deviation_squared / (double)segcnt; -// standard_deviation = sqrt(variance); -// -// TR2(("(SKEW) Rows in standard deviaton: %f\n", standard_deviation)); -// -// coefficient_of_variation = row_out_avg ? 
standard_deviation/(double)row_out_avg : 0.0f; -// -// apr_pool_destroy(tmp_pool); -// TR2(("(SKEW) Rows out skew: %f\n", coefficient_of_variation)); -// -// return coefficient_of_variation; -// } - - static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const char* nowstr, apr_uint32_t done) { char timsubmitted[GPMON_DATE_BUF_SIZE]; diff --git a/contrib/perfmon/src/gpmmon/gpmondb.c b/contrib/perfmon/src/gpmmon/gpmondb.c index 3c34c181404..e1c7348894c 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.c +++ b/contrib/perfmon/src/gpmmon/gpmondb.c @@ -1151,166 +1151,6 @@ apr_status_t gpdb_harvest(void) return call_for_each_table(harvest, NULL, NULL); } -//static bool gpdb_insert_alert_log() -//{ -// PGconn* conn = 0; -// PGresult* result = 0; -// const char* QRY = "insert into log_alert_history select * from log_alert_tail;"; -// const char* errmsg; -// errmsg = gpdb_exec(&conn, &result, QRY); -// -// bool success = true; -// if (errmsg) -// { -// gpmon_warningx( -// FLINE, 0, -// "---- ARCHIVING HISTORICAL ALERT DATA FAILED ---- on query %s with error %s\n", -// QRY, errmsg); -// success = false; -// } -// else -// { -// TR1(("load completed OK: alert_log\n")); -// } -// -// PQclear(result); -// PQfinish(conn); -// return success; -//} - -//static void gpdb_remove_success_files(apr_array_header_t *success_append_files, apr_pool_t *pool) -//{ -// void *file_slot = NULL; -// while ((file_slot = apr_array_pop(success_append_files))) -// { -// const char *file_name = (*(char**)file_slot); -// if (file_name) -// { -// if (apr_file_remove(file_name, pool) != APR_SUCCESS) -// { -// gpmon_warningx(FLINE, 0, "failed removing file:%s", file_name); -// } -// } -// } -//} - -//static int cmp_string(const void *left, const void *right) -//{ -// const char *op1 = *(const char**)left; -// const char *op2 = *(const char**)right; -// return strcmp(op1, op2); -//} - -// Find all files start with 'gpdb-alert' under GPMON_LOG directory, sort it and -// remove the latest 
one 'gpdb-alert-*.csv' as it is still used by GPDB. -//static void get_alert_log_tail_files(apr_array_header_t *tail_files, apr_pool_t *pool) -//{ -// apr_dir_t *dir; -// apr_status_t status = apr_dir_open(&dir, GPMON_LOG, pool); -// if (status != APR_SUCCESS) -// { -// gpmon_warningx(FLINE, status, "failed opening directory:%s", GPMON_LOG); -// return; -// } -// -// apr_finfo_t dirent; -// static const char gpdb_prefix[] = "gpdb-alert"; -// while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) -// { -// if (strncmp(dirent.name, gpdb_prefix, sizeof(gpdb_prefix) - 1) == 0) -// { -// void *file_slot = apr_array_push(tail_files); -// if (! file_slot) -// { -// gpmon_warningx(FLINE, 0, "failed getting alert tail log:%s due to out of memory", dirent.name); -// continue; -// } -// (*(const char**)file_slot) = apr_pstrcat(pool, GPMON_LOG, "/", dirent.name, NULL); -// } -// } -// -// // We only want to use qsort in stdlib.h, not the macro qsort in port.h. -// (qsort)(tail_files->elts, tail_files->nelts, tail_files->elt_size, cmp_string); -// (void)apr_array_pop(tail_files); -// apr_dir_close(dir); -//} - -/* gp_elog has been moved */ -/* -void gpdb_import_alert_log(apr_pool_t *pool) -{ - // Get alert log files to be imported. - apr_array_header_t* tail_files = apr_array_make(pool, 10, sizeof(char*)); - apr_array_header_t* success_append_files = apr_array_make(pool, 10, sizeof(char*)); - get_alert_log_tail_files(tail_files, pool); - - // Create or truncate stage file. 
- char *dst_file = apr_pstrcat(pool, GPMON_LOG, "/", GPMON_ALERT_LOG_STAGE, NULL); - apr_status_t status = truncate_file(dst_file, pool); - if (status != APR_SUCCESS) - { - gpmon_warningx(FLINE, 0, "failed truncating stage file:%s", dst_file); - return; - } - - // Append alert log tail file to stage file - void *tail_file = NULL; - while ((tail_file = apr_array_pop(tail_files))) - { - char *filename = *(char**)tail_file; - void *success_file_slot = apr_array_push(success_append_files); - if (!success_file_slot) - { - gpmon_warningx( - FLINE, 0, "failed appending file:%s to stage file:%s due to out of memory", - filename, dst_file); - continue; - } - (*(char**)success_file_slot) = NULL; - - status = apr_file_append(filename, dst_file, APR_FILE_SOURCE_PERMS, pool); - if (status != APR_SUCCESS) - { - gpmon_warningx(FLINE, status, "failed appending file:%s to stage file:%s", filename, dst_file); - continue; - } - else - { - (*(char**)success_file_slot) = filename; - TR1(("success appending file:%s to stage file:%s\n", filename, dst_file)); - } - } - - // Insert tail file to history table. 
- if (!gpdb_insert_alert_log()) - { - // Failure might happen on malformed log entries - time_t now; - char timestr[20]; - char *bad_file; - - // Copy failed log into separate file for user attention - now = time(NULL); - strftime(timestr, 20, "%Y-%m-%d_%H%M%S", localtime(&now)); - bad_file = apr_pstrcat(pool, GPMON_LOG, "/", GPMON_ALERT_LOG_STAGE, "_broken_", timestr, NULL); - if (apr_file_copy(dst_file, bad_file, APR_FPROT_FILE_SOURCE_PERMS, pool) == APR_SUCCESS) - { - gpmon_warningx(FLINE, status, "Staging file with broken entries is archived to %s", bad_file); - } - else - { - gpmon_warningx(FLINE, status, "failed copying stage file:%s to broken file:%s", dst_file, bad_file); - } - } - - // Delete tail file regardless of load success, as keeping too many tail files - // might cause serious harm to the system - gpdb_remove_success_files(success_append_files, pool); - truncate_file(dst_file, pool); -} -*/ - - /* insert _tail data into history table */ apr_status_t gpdb_check_partitions(mmon_options_t *opt) { From d8f671051ff5156eaa3c25a2ccba56e6ea3eccc9 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 29 Nov 2024 12:26:23 +0800 Subject: [PATCH 24/40] [perfmon] Let gpmmon to generate tmid instead of gpmon more details see README.md queryid part --- contrib/perfmon/README.md | 16 ++++++- contrib/perfmon/src/gpmmon/gpmmon.c | 17 ++++++- contrib/perfmon/src/gpmmon/gpmon_agg.c | 43 ++++++++++------- contrib/perfmon/src/gpmmon/gpmon_agg.h | 2 +- contrib/perfmon/src/gpmmon/gpmondb.h | 2 +- contrib/perfmon/src/gpmon/gpmon.c | 65 +++++++++----------------- contrib/perfmon/src/gpsmon/gpsmon.c | 9 ++-- contrib/perfmon/src/include/gpmon.h | 6 +-- 8 files changed, 87 insertions(+), 73 deletions(-) diff --git a/contrib/perfmon/README.md b/contrib/perfmon/README.md index 1f1414290e1..7b3f8e2e182 100644 --- a/contrib/perfmon/README.md +++ b/contrib/perfmon/README.md @@ -35,4 +35,18 @@ Find more information about the architecture on [the wiki page](https://github.c 2) sudo 
/etc/hosts # and separate out (re)definitions of 127.0.0.1, something like: 127.0.0.1 foo 127.0.0.1 localhost - +## Design +### QueryId + QueryId is composed of tmid int32, sessionid int32 and ccnt int32 (gp_command_count) in the following format + `tmid-sessionid-ccnt` + sessionid and ccnt is from query, but tmid is generated by gpperfmon. It is actually the perfmon start time. + So when the perfmon or database restarted, even if the sessionid and ccnt can be repetitive, due to the different + tmid, the queryid is unique. + Now the tmid is generated by gpmmon because there is only one gpmmon in the whole cluster, gpmon and gpsmon + both have multiple nodes. This can guarantee the tmid is unique. + However, gpmon who produces the query info needs the queryid, but it cannot get the tmid from gpmmon. As in the + current cluster, the sessionid and ccnt can be the unique identification of a query, tmid is not necessary. So + in gpmon, we just set the tmid to 0. And when gpmmon receives the query, then it resets the tmid. + gpmon writes the query file named with `q0(tmid)-sessionid-ccnt`, when the cluster restarted, the old + files will be rewritten by the new files. That doesn't matter, as we don't care the old files after the cluster + restarted. 
diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index 33881ebb85c..e82d42f5968 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -121,6 +121,9 @@ apr_queue_t* message_queue = NULL; sigset_t unblocksig; sigset_t blocksig; +/* tmid */ +int32 tmid = -1; + extern int gpdb_exec_search_for_at_least_one_row(const char* QUERY, PGconn* persistant_conn); /* Function defs */ @@ -131,7 +134,7 @@ static apr_status_t sendpkt(int sock, const gp_smon_to_mmon_packet_t* pkt); static apr_status_t recvpkt(int sock, gp_smon_to_mmon_packet_t* pkt, bool loop_until_all_recv); static void def_gucs(void); - +static void init_tmid(void); #define MMON_LOG_FILENAME_SIZE (MAXPATHLEN+1) char mmon_log_filename[MMON_LOG_FILENAME_SIZE]; @@ -1447,7 +1450,9 @@ int perfmon_main(Datum arg) } } - //create_log_alert_table(); + /* init tmid */ + init_tmid(); + gpmmon_main(); cleanup(); @@ -1799,3 +1804,11 @@ def_gucs(void) DefineCustomBoolVariable("perfmon.enable", "Enable perfmon monitoring.", NULL, &perfmon_enabled, false, PGC_POSTMASTER, 0, NULL, NULL, NULL); } + +static void +init_tmid(void) +{ + time_t t; + t = time(NULL); + tmid = t; +} \ No newline at end of file diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 1fb23820242..70672bb6fff 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -70,12 +70,13 @@ typedef struct dbmetrics_t { extern int min_query_time; extern mmon_options_t opt; extern apr_queue_t* message_queue; -int32 tmid = -1; +extern int32 tmid; extern void incremement_tail_bytes(apr_uint64_t bytes); static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool); static void format_time(time_t tt, char *buf); +static void set_tmid(gp_smon_to_mmon_packet_t* pkt, int32 tmid); static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t 
ccnt, apr_hash_t *hash, apr_pool_t *pool) { @@ -145,13 +146,10 @@ static apr_status_t agg_put_fsinfo(agg_t* agg, const gpmon_fsinfo_t* met) static apr_status_t agg_put_queryseg(agg_t* agg, const gpmon_query_seginfo_t* met, apr_int64_t generation) { qdnode_t* dp; - gpmon_qlogkey_t key; + gpmon_qlogkey_t key = met->key.qkey; mmon_query_seginfo_t* rec = 0; /* find qdnode of this qexec */ - key.tmid = met->key.qkey.tmid; - key.ssid = met->key.qkey.ssid; - key.ccnt = met->key.qkey.ccnt; dp = apr_hash_get(agg->qtab, &key, sizeof(key)); if (!dp) { /* not found, internal SPI query. Ignore. */ @@ -203,11 +201,9 @@ static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { - gpmon_qlogkey_t key = qlog->key; - key.tmid = tmid; qdnode_t *node; - node = apr_hash_get(agg->qtab, &key, sizeof(key)); + node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); if (!node) { TR2(("put query metrics can not find qdnode from qtab, queryID :%d-%d-%d \n", @@ -235,10 +231,6 @@ static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { - if (tmid == -1) - { - tmid = qlog->key.tmid; - } if (qlog->dbid == gpperfmon_dbid) { TR2(("agg_put_qlog:(%d.%d.%d) ignore gpperfmon sql\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt)); return 0; @@ -436,8 +428,9 @@ void agg_destroy(agg_t* agg) apr_pool_destroy(agg->pool); } -apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt) +apr_status_t agg_put(agg_t* agg, gp_smon_to_mmon_packet_t* pkt) { + set_tmid(pkt, tmid); if (pkt->header.pkttype == GPMON_PKTTYPE_METRICS) return agg_put_metrics(agg, &pkt->u.metrics); if (pkt->header.pkttype == GPMON_PKTTYPE_QLOG) @@ -1290,9 +1283,10 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows char qfname[qfname_size]; int size = 
0; FILE* qfptr = 0; - snprintf(qfname, qfname_size, GPMON_DIR "q%d-%d-%d.txt", qdnode->qlog.key.tmid, - qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); - qfptr = fopen(qfname, "r"); + snprintf(qfname, qfname_size, GPMON_DIR "q%d-%d-%d.txt", 0, + qdnode->qlog.key.ssid, + qdnode->qlog.key.ccnt); + qfptr = fopen(qfname, "r"); if (qfptr) { // array[0] query text @@ -1372,3 +1366,20 @@ static int bloom_isset(bloom_t* bloom, const char* name) */ return 0 != (bloom->map[idx] & (1 << off)); } + +static void +set_tmid(gp_smon_to_mmon_packet_t* pkt, int32 tmid) +{ + + if (pkt->header.pkttype == GPMON_PKTTYPE_QLOG || + pkt->header.pkttype == GPMON_PKTTYPE_QUERY_HOST_METRICS) + { + gpmon_qlog_t* qlog = &(pkt->u.qlog); + qlog->key.tmid = tmid; + } + if (pkt->header.pkttype == GPMON_PKTTYPE_QUERYSEG) + { + gpmon_query_seginfo_t* met = &pkt->u.queryseg; + met->key.qkey.tmid = tmid; + } +} diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.h b/contrib/perfmon/src/gpmmon/gpmon_agg.h index 1da8d5b6318..c3757ea9724 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.h +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.h @@ -8,7 +8,7 @@ typedef struct agg_t agg_t; apr_status_t agg_create(agg_t** retagg, apr_int64_t generation, apr_pool_t* parent_pool, apr_hash_t* fsinfotab); apr_status_t agg_dup(agg_t** agg, agg_t* oldagg, apr_pool_t* pool, apr_hash_t* fsinfotab); void agg_destroy(agg_t* agg); -apr_status_t agg_put(agg_t* agg, const gp_smon_to_mmon_packet_t* pkt); +apr_status_t agg_put(agg_t* agg, gp_smon_to_mmon_packet_t* pkt); apr_status_t agg_dump(agg_t* agg); typedef struct qdnode_t { apr_int64_t last_updated_generation; diff --git a/contrib/perfmon/src/gpmmon/gpmondb.h b/contrib/perfmon/src/gpmmon/gpmondb.h index 75a48b8a38c..f05a48e5d7b 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.h +++ b/contrib/perfmon/src/gpmmon/gpmondb.h @@ -92,7 +92,7 @@ APR_DECLARE (void) create_log_alert_table(void); int find_token_in_config_string(char*, char**, const char*); void 
process_line_in_hadoop_cluster_info(apr_pool_t*, apr_hash_t*, char*, char*, char*); int get_hadoop_hosts_and_add_to_hosts(apr_pool_t*, apr_hash_t*, mmon_options_t*); -void gpdb_get_spill_file_size_from_query(qdnode_t* qdnode); +extern void gpdb_get_spill_file_size_from_query(qdnode_t* qdnode); apr_status_t truncate_file(char*, apr_pool_t*); #endif /* GPMONDB_H */ diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 0fd96886ec1..2a46e37c85a 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -31,7 +31,6 @@ #include "pg_query_state.h" PG_MODULE_MAGIC; -static int32 init_tmid = -1;; void _PG_init(void); void _PG_fini(void); @@ -43,8 +42,8 @@ static void gpmon_record_kv_with_file(const char* key, const char* value, bool extraNewLine, FILE* fp); -static void gpmon_record_update(int32 tmid, int32 ssid, - int32 ccnt, int32 status); +static void gpmon_record_update(gpmon_qlogkey_t key, + int32 status); static const char* gpmon_null_subst(const char* input); /* gpmon hooks */ @@ -70,7 +69,7 @@ static void gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, QueryDesc *qd, boo static void gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket, QueryDesc *qd); static void gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket, QueryDesc *qd); static void gpmon_send(gpmon_packet_t*); -static void gpmon_gettmid(int32*); +static inline void set_query_key(gpmon_qlogkey_t *key, int32 ccnt); struct { int gxsock; @@ -168,13 +167,14 @@ static void gpmon_record_kv_with_file(const char* key, } } -void gpmon_record_update(int32 tmid, int32 ssid, int32 ccnt, +static void +gpmon_record_update(gpmon_qlogkey_t key, int32 status) { char fname[GPMON_DIR_MAX_PATH]; FILE *fp; - snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, tmid, ssid, ccnt); + snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, key.tmid, key.ssid, key.ccnt); fp = fopen(fname, "r+"); @@ -188,12 +188,13 @@ void 
gpmon_record_update(int32 tmid, int32 ssid, int32 ccnt, fclose(fp); } -static void -gpmon_gettmid(int32* tmid) +static inline void +set_query_key(gpmon_qlogkey_t *key, int32 ccnt) { - Assert(init_tmid > -1); - *tmid = init_tmid; -} + key->tmid = 0; + key->ssid = gp_session_id; + key->ccnt = ccnt; +} static void gpmon_send(gpmon_packet_t* p) @@ -207,15 +208,15 @@ gpmon_send(gpmon_packet_t* p) elog(DEBUG1, "[perfmon] Perfmon Executor Packet: (tmid, ssid, ccnt, segid, pid, nid, status) = " "(%d, %d, %d, %d, %d, %d, %d)", - p->u.qexec.key.tmid, p->u.qexec.key.ssid, p->u.qexec.key.ccnt, + p->u.qexec.key.qkey.tmid, p->u.qexec.key.qkey.ssid, p->u.qexec.key.qkey.ccnt, p->u.qexec.key.hash_key.segid, p->u.qexec.key.hash_key.pid, p->u.qexec.key.hash_key.nid, p->u.qexec.status); } - + if (gpmon.gxsock > 0) { int n = sizeof(*p); - if (n != sendto(gpmon.gxsock, (const char *)p, n, 0, - (struct sockaddr*) &gpmon.gxaddr, + if (n != sendto(gpmon.gxsock, (const char *)p, n, 0, + (struct sockaddr*) &gpmon.gxaddr, sizeof(gpmon.gxaddr))) { elog(LOG, "[perfmon]: cannot send (%m socket %d)", gpmon.gxsock); } @@ -252,7 +253,7 @@ gpmon_qlog_packet_init(QueryDesc *qd) gpmonPacket->pkttype = GPMON_PKTTYPE_QLOG; gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_SILENT; - gpmon_gettmid(&gpmonPacket->u.qlog.key.tmid); + set_query_key(&gpmonPacket->u.qlog.key, get_query_command_count(qd)); gpmonPacket->u.qlog.key.ssid = gp_session_id; gpmonPacket->u.qlog.pid = MyProcPid; @@ -262,8 +263,6 @@ gpmon_qlog_packet_init(QueryDesc *qd) username ? 
username : ""); gpmonPacket->u.qlog.dbid = MyDatabaseId; - /* Fix up command count */ - gpmonPacket->u.qlog.key.ccnt = get_query_command_count(qd); return gpmonPacket; } @@ -286,10 +285,8 @@ gpmon_qexec_packet_init() gpmonPacket->version = GPMON_PACKET_VERSION; gpmonPacket->pkttype = GPMON_PKTTYPE_QEXEC; - gpmon_gettmid(&gpmonPacket->u.qexec.key.tmid); - gpmonPacket->u.qexec.key.ssid = gp_session_id; /* Better to use get_query_command_count here */ - gpmonPacket->u.qexec.key.ccnt = gp_command_count; + set_query_key(&gpmonPacket->u.qexec.key.qkey, gp_command_count); gpmonPacket->u.qexec.key.hash_key.segid = GpIdentity.segindex; gpmonPacket->u.qexec.key.hash_key.pid = MyProcPid; return gpmonPacket; @@ -395,9 +392,7 @@ gpmon_qlog_query_start(gpmon_packet_t *gpmonPacket, QueryDesc *qd) gpmonPacket->u.qlog.tsubmit = get_query_tsubmit(qd); gpmonPacket->u.qlog.tstart = tv.tv_sec; set_query_tstart(tv.tv_sec, qd); - gpmon_record_update(gpmonPacket->u.qlog.key.tmid, - gpmonPacket->u.qlog.key.ssid, - gpmonPacket->u.qlog.key.ccnt, + gpmon_record_update(gpmonPacket->u.qlog.key, gpmonPacket->u.qlog.status); gpmon_send(gpmonPacket); } @@ -418,9 +413,7 @@ gpmon_qlog_query_end(gpmon_packet_t *gpmonPacket, QueryDesc *qd, bool updateReco gpmonPacket->u.qlog.tstart = get_query_tstart(qd); gpmonPacket->u.qlog.tfin = tv.tv_sec; if (updateRecord) - gpmon_record_update(gpmonPacket->u.qlog.key.tmid, - gpmonPacket->u.qlog.key.ssid, - gpmonPacket->u.qlog.key.ccnt, + gpmon_record_update(gpmonPacket->u.qlog.key, gpmonPacket->u.qlog.status); gpmon_send(gpmonPacket); @@ -443,9 +436,7 @@ gpmon_qlog_query_error(gpmon_packet_t *gpmonPacket, QueryDesc *qd) gpmonPacket->u.qlog.tstart = get_query_tstart(qd); gpmonPacket->u.qlog.tfin = tv.tv_sec; - gpmon_record_update(gpmonPacket->u.qlog.key.tmid, - gpmonPacket->u.qlog.key.ssid, - gpmonPacket->u.qlog.key.ccnt, + gpmon_record_update(gpmonPacket->u.qlog.key, gpmonPacket->u.qlog.status); gpmon_send(gpmonPacket); @@ -462,10 +453,7 @@ 
gpmon_qlog_query_canceling(gpmon_packet_t *gpmonPacket, QueryDesc *qd) gpmonPacket->u.qlog.status = GPMON_QLOG_STATUS_CANCELING; gpmonPacket->u.qlog.tsubmit = get_query_tsubmit(qd); gpmonPacket->u.qlog.tstart = get_query_tstart(qd); - - gpmon_record_update(gpmonPacket->u.qlog.key.tmid, - gpmonPacket->u.qlog.key.ssid, - gpmonPacket->u.qlog.key.ccnt, + gpmon_record_update(gpmonPacket->u.qlog.key, gpmonPacket->u.qlog.status); gpmon_send(gpmonPacket); @@ -590,7 +578,6 @@ init_gpmon_hooks(void) void _PG_init(void) { - time_t t; if (!process_shared_preload_libraries_in_progress) { ereport(ERROR, (errmsg("gpmon not in shared_preload_libraries"))); @@ -603,14 +590,6 @@ _PG_init(void) ereport(LOG, (errmsg("booting gpmon"))); } init_gpmon_hooks(); - - t = time(NULL); - - if (t == (time_t) -1) - { - elog(PANIC, "[perfmon] cannot generate global transaction id"); - } - init_tmid = t; gpmon_init(); init_pg_query_state(); } diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index b9899955b91..cca33c92183 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -1018,9 +1018,9 @@ static void gx_recvqexec(gpmon_packet_t* pkt) p = &pkt->u.qexec; get_pid_metrics(p->key.hash_key, - p->key.tmid, - p->key.ssid, - p->key.ccnt); + p->key.qkey.tmid, + p->key.qkey.ssid, + p->key.qkey.ccnt); // Store some aggregated information somewhere for metrics in // queries_* tables, like cpu_elapsed, rows_out, and etc. 
//extract_segments_exec(pkt); @@ -1711,5 +1711,4 @@ int main(int argc, const char* const argv[]) gx_main(port, signature); return 0; -} - +} \ No newline at end of file diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index a1c5ead3564..b0a4e0cb32c 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -180,10 +180,8 @@ typedef struct gpmon_qexec_hash_key_t { * QE will NOT need to touch anything begin with _ */ typedef struct gpmon_qexeckey_t { - int32 tmid; /* transaction time */ - int32 ssid; /* session id */ - int32 ccnt; /* command count */ - gpmon_qexec_hash_key_t hash_key; + gpmon_qlogkey_t qkey; + gpmon_qexec_hash_key_t hash_key; }gpmon_qexeckey_t; struct gpmon_qexec_t { From d4b052cdb3db4ca938acf1a48d8198e639cff5fd Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 29 Nov 2024 17:57:51 +0800 Subject: [PATCH 25/40] [perfmon] Let gpmmon not to kill gpsmon when timeout getting response Gpmmon polls the query info from gpsmon each 'quantum' time. But when there is no query running, gpsmon doesn't respond to it. Then gpmmon considers gpsmon dead and restarts it. This will prevent the whole system from running correctly when busy restarting the gpsmon. Fix it by letting the gpmmon ignore the timeout event when waiting the response from gpsmon. 
--- contrib/perfmon/src/gpmmon/gpmmon.c | 33 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index e82d42f5968..81d3909d064 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -65,7 +65,7 @@ char* get_ip_for_host(char*, bool*); mmon_options_t opt = { 0 }; static const apr_uint64_t smon_terminate_safe_factor = 10; -static const apr_uint64_t recv_timeout_factor = 10; +static const apr_uint64_t recv_timeout_factor = 30; // If smon doesn't receive any request from mmon, // it simply kill itself to restart. @@ -231,6 +231,17 @@ static void SIGUSR2_handler(int sig) ax.exit = 1; } +static void +add_recv_from_gx_event(host_t *h) +{ + struct timeval tv; + tv.tv_sec = recv_timeout_factor * gpmmon_quantum(); + tv.tv_usec = 0; + if (event_add(h->event, &tv)) + { + gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); + } +} /** ------------------------------------------------------------ After we sent a 'D'ump command, gpsmon will send us packets thru @@ -250,18 +261,12 @@ static void recv_from_gx(SOCKET sock, short event, void* arg) // no response from gpsmon for a long time // retry connecting TR1(("Connection to %s timeout\n",h->hostname)); - h->eflag = 1; - } - else if (event & EV_READ) + add_recv_from_gx_event(h); + return; + } + else if (event & EV_READ) { - // reset timer of timeout event - struct timeval tv; - tv.tv_sec = 10 * gpmmon_quantum(); - tv.tv_usec = 0; - if (event_add(h->event, &tv)) - { - gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); - } + add_recv_from_gx_event(h); } else { @@ -1750,7 +1755,7 @@ static void getconfig(void) * Define GUCs * start gpmmon bgworker */ -void +void _PG_init(void) { if (!process_shared_preload_libraries_in_progress) @@ -1768,7 +1773,7 @@ _PG_init(void) memset(&worker, 0, sizeof(BackgroundWorker)); def_gucs(); - + /* start gpmmon only on 
coordinator */ if (!IS_QUERY_DISPATCHER()) { From 984fcf5672cbc35fa99d91d38c5788bca173c084 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 29 Nov 2024 14:16:02 +0800 Subject: [PATCH 26/40] Fix getting the query text file name The query text file name's format is 'qtmid_ssid_ccnt', but the tmid is always 0 now. Adding the function 'get_query_text_file_name' for it. --- contrib/perfmon/src/common/gpmonlib.c | 17 +++++-- contrib/perfmon/src/gpmmon/gpmon_agg.c | 64 +++++++++++--------------- contrib/perfmon/src/gpmon/gpmon.c | 2 +- contrib/perfmon/src/include/gpmonlib.h | 5 +- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/contrib/perfmon/src/common/gpmonlib.c b/contrib/perfmon/src/common/gpmonlib.c index e7c7d5c0e5c..2a00cd2833e 100644 --- a/contrib/perfmon/src/common/gpmonlib.c +++ b/contrib/perfmon/src/common/gpmonlib.c @@ -325,14 +325,13 @@ char* gpmon_datetime_rounded(time_t t, char str[GPMON_DATE_BUF_SIZE]) } /* get status from query text file */ -apr_int32_t get_query_status(apr_int32_t tmid, apr_int32_t ssid, - apr_int32_t ccnt) +apr_int32_t get_query_status(gpmon_qlogkey_t qkey) { char fname[GPMON_DIR_MAX_PATH]; FILE *fp; apr_int32_t status = GPMON_QLOG_STATUS_INVALID; - snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, tmid, ssid, ccnt); + get_query_text_file_name(qkey, fname); fp = fopen(fname, "r"); if (!fp) @@ -354,7 +353,7 @@ apr_int32_t get_query_status(apr_int32_t tmid, apr_int32_t ssid, } /* get query text from query text file */ -char *get_query_text(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_pool_t *pool) +char *get_query_text(gpmon_qlogkey_t qkey, apr_pool_t *pool) { char meta[META_LEN] = {0}; signed int qrylen = 0; @@ -362,7 +361,7 @@ char *get_query_text(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_p const char *META_FMT = "%d qtext"; const char *META_QTEXT = "qtext\n"; - snprintf(fname, GPMON_DIR_MAX_PATH, "%sq%d-%d-%d.txt", GPMON_DIR, tmid, ssid, ccnt); + 
get_query_text_file_name(qkey, fname); FILE *fp = fopen(fname, "r"); if (!fp) @@ -557,3 +556,11 @@ void merge_qlog(gpmon_qlog_t* qlog, const gpmon_qlog_t* newqlog) return; } } + +void +get_query_text_file_name(gpmon_qlogkey_t key, char *fname) +{ + const int fname_size = 100; + snprintf(fname, fname_size, GPMON_DIR "q%d-%d-%d.txt", 0, + key.ssid, key.ccnt); +} \ No newline at end of file diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 70672bb6fff..5afa4f93608 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -73,27 +73,26 @@ extern apr_queue_t* message_queue; extern int32 tmid; extern void incremement_tail_bytes(apr_uint64_t bytes); -static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, - apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool); +static bool is_query_not_active(gpmon_qlogkey_t qkey, apr_hash_t *hash, apr_pool_t *pool); static void format_time(time_t tt, char *buf); static void set_tmid(gp_smon_to_mmon_packet_t* pkt, int32 tmid); -static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_hash_t *hash, apr_pool_t *pool) +static bool is_query_not_active(gpmon_qlogkey_t qkey, apr_hash_t *hash, apr_pool_t *pool) { // get active query of session - char *key = apr_psprintf(pool, "%d", ssid); + char *key = apr_psprintf(pool, "%d", qkey.ssid); char *active_query = apr_hash_get(hash, key, APR_HASH_KEY_STRING); if (active_query == NULL) { - TR0(("Found orphan query, tmid:%d, ssid:%d, ccnt:%d\n", tmid, ssid, ccnt)); + TR0(("Found orphan query, tmid:%d, ssid:%d, ccnt:%d\n", qkey.tmid, qkey.ssid, qkey.ccnt)); return true; } // read query text from q file - char *query = get_query_text(tmid, ssid, ccnt, pool); + char *query = get_query_text(qkey, pool); if (query == NULL) { - TR0(("Found error while reading query text in file '%sq%d-%d-%d.txt'\n", GPMON_DIR, tmid, ssid, ccnt)); + TR0(("Found error while reading query text 
in file '%sq%d-%d-%d.txt'\n", GPMON_DIR, qkey.tmid, qkey.ssid, qkey.ccnt)); return true; } // if the current active query of session (ssid) is not the same @@ -108,7 +107,7 @@ static bool is_query_not_active(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t int res = strncmp(query, active_query, qlen); if (res != 0) { - TR0(("Found orphan query, tmid:%d, ssid:%d, ccnt:%d\n", tmid, ssid, ccnt)); + TR0(("Found orphan query, tmid:%d, ssid:%d, ccnt:%d\n", qkey.tmid, qkey.ssid, qkey.ccnt)); return true; } @@ -325,8 +324,6 @@ apr_status_t agg_create(agg_t** retagg, apr_int64_t generation, apr_pool_t* pare return 0; } - - apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr_hash_t* fsinfotab) { int e, cnt; @@ -370,7 +367,7 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr if (age > 0) { if (((age % 5 == 0) /* don't call is_query_not_active every time because it's expensive */ - && is_query_not_active(dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, active_query_tab, newagg->pool))) + && is_query_not_active(dp->qlog.key, active_query_tab, newagg->pool))) { if (dp->qlog.dbid != gpperfmon_dbid) { @@ -570,10 +567,7 @@ apr_status_t agg_dump(agg_t* agg) { const int fname_size = sizeof(GPMON_DIR) + 100; char fname[fname_size]; - snprintf(fname, fname_size, GPMON_DIR "q%d-%d-%d.txt", - qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, - qdnode->qlog.key.ccnt); - + get_query_text_file_name(qdnode->qlog.key, fname); bloom_set(&bloom, fname); } @@ -611,6 +605,7 @@ static void delete_old_files(bloom_t* bloom) char findCmd[512] = {0}; FILE* fp = NULL; time_t cutoff = time(0) - gpmmon_quantum() * 3; + cutoff = cutoff < 10 ? 
10 : cutoff; /* Need to remove trailing / in dir so find results are consistent * between platforms @@ -650,14 +645,14 @@ static void delete_old_files(bloom_t* bloom) TR2(("File %s expired: %d\n", p, expired)); if (expired) { - apr_int32_t tmid = 0, ssid = 0, ccnt = 0; + gpmon_qlogkey_t qkey = {0}; if (bloom_isset(bloom, p)) { TR2(("File %s has bloom set. Checking status\n", p)); /* Verify no bloom collision */ - sscanf(p, GPMON_DIR "q%d-%d-%d.txt", &tmid, &ssid, &ccnt); - TR2(("tmid: %d, ssid: %d, ccnt: %d\n", tmid, ssid, ccnt)); - status = get_query_status(tmid, ssid, ccnt); + sscanf(p, GPMON_DIR "q%d-%d-%d.txt", &qkey.tmid, &qkey.ssid, &qkey.ccnt); + TR2(("tmid: %d, ssid: %d, ccnt: %d\n", qkey.tmid, qkey.ssid, qkey.ccnt)); + status = get_query_status(qkey); TR2(("File %s has status of %d\n", p, status)); if (status == GPMON_QLOG_STATUS_DONE || status == GPMON_QLOG_STATUS_ERROR) @@ -1264,28 +1259,25 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows if (qdnode->num_metrics_packets) { - // average cpu_pct per reporting machine - cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; - fd_cnt = qdnode->qlog.p_metrics.fd_cnt / qdnode->num_metrics_packets; - cpu_skew = qdnode->qlog.p_metrics.cpu_skew / qdnode->num_metrics_packets; - } - else - { - cpu_current = 0.0f; - fd_cnt = 0; - cpu_skew = 0.0f; - } - + // average cpu_pct per reporting machine + cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; + fd_cnt = qdnode->qlog.p_metrics.fd_cnt / qdnode->num_metrics_packets; + cpu_skew = qdnode->qlog.p_metrics.cpu_skew / qdnode->num_metrics_packets; + } + else + { + cpu_current = 0.0f; + fd_cnt = 0; + cpu_skew = 0.0f; + } - // get query text、plan - char* array[5] = {"", "", "", "", ""}; + // get query text and plan + char* array[5] = {"", "", "", "", ""}; const int qfname_size = 256; char qfname[qfname_size]; int size = 0; FILE* qfptr = 0; - snprintf(qfname, qfname_size, GPMON_DIR 
"q%d-%d-%d.txt", 0, - qdnode->qlog.key.ssid, - qdnode->qlog.key.ccnt); + get_query_text_file_name(qdnode->qlog.key, qfname); qfptr = fopen(qfname, "r"); if (qfptr) { diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 2a46e37c85a..45a7bfc9e18 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -329,7 +329,7 @@ static const char* gpmon_null_subst(const char* input) * necessary because gpmon overwrites the last byte to indicate status. * * Have tested the speed of this function on local machine - * - each file is 0B, 1000 files, tabke about 50ms + * - each file is 0B, 1000 files, take about 50ms * - each file is 102B, 1000 files, take about 70ms * - each file is 57K, 1000 files, take about 240ms */ diff --git a/contrib/perfmon/src/include/gpmonlib.h b/contrib/perfmon/src/include/gpmonlib.h index 5552970a39c..7d35557fac6 100644 --- a/contrib/perfmon/src/include/gpmonlib.h +++ b/contrib/perfmon/src/include/gpmonlib.h @@ -78,8 +78,8 @@ extern char* gpmon_datetime(time_t t, char str[GPMON_DATE_BUF_SIZE]); extern char* gpmon_datetime_rounded(time_t t, char str[GPMON_DATE_BUF_SIZE]); /* utility */ -extern apr_int32_t get_query_status(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt); -extern char *get_query_text(apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt, apr_pool_t *pool); +extern apr_int32_t get_query_status(gpmon_qlogkey_t qkey); +extern char *get_query_text(gpmon_qlogkey_t qkey, apr_pool_t *pool); #define DEFAULT_PATH_TO_HADOOP_HOST_FILE "/etc/gphd/gphdmgr/conf/clusterinfo.txt" #define PATH_TO_HADOOP_SMON_LOGS "/var/log/gphd/smon" @@ -244,4 +244,5 @@ extern void gp_smon_to_mmon_set_header(gp_smon_to_mmon_packet_t* pkt, apr_int16_ apr_status_t apr_pool_create_alloc(apr_pool_t ** newpool, apr_pool_t *parent); void gpdb_get_single_string_from_query(const char* QUERY, char** resultstring, apr_pool_t* pool); void merge_qlog(gpmon_qlog_t* qlog, const gpmon_qlog_t* newqlog); +extern void 
get_query_text_file_name(gpmon_qlogkey_t key, char *fname); #endif /* GPMONLIB_H */ From bab80a967b77385f877fa25ac9dde01a92fdd7e0 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Tue, 10 Dec 2024 14:03:34 +0800 Subject: [PATCH 27/40] [perfmon] fix gpmon_datetime_rounded tip: don't need to cherry-pick it to cloud --- contrib/perfmon/src/common/gpmonlib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/perfmon/src/common/gpmonlib.c b/contrib/perfmon/src/common/gpmonlib.c index 2a00cd2833e..e40e634260f 100644 --- a/contrib/perfmon/src/common/gpmonlib.c +++ b/contrib/perfmon/src/common/gpmonlib.c @@ -319,6 +319,7 @@ char* gpmon_datetime_rounded(time_t t, char str[GPMON_DATE_BUF_SIZE]) return str; } + tm.tm_sec = (tm.tm_sec /5) * 5; strftime(str, GPMON_DATE_BUF_SIZE - 1, "%Y-%m-%d %H:%M:%S%z", &tm); return str; @@ -563,4 +564,4 @@ get_query_text_file_name(gpmon_qlogkey_t key, char *fname) const int fname_size = 100; snprintf(fname, fname_size, GPMON_DIR "q%d-%d-%d.txt", 0, key.ssid, key.ccnt); -} \ No newline at end of file +} From 1e3b425df46cdf79b8d402e55753014ce81bf521 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 20 Dec 2024 10:43:44 +0800 Subject: [PATCH 28/40] perfmon: pg_query_state using pid to record proc info Using PGPROC to track the backend info in pg_query_state will lead to memory problems as QE and QD are not on the same host --- contrib/perfmon/src/gpmon/pg_query_state.c | 8 ++++---- contrib/perfmon/src/gpmon/pg_query_state.h | 3 ++- contrib/perfmon/src/gpmon/signal_handler.c | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/contrib/perfmon/src/gpmon/pg_query_state.c b/contrib/perfmon/src/gpmon/pg_query_state.c index 9db159805d4..b661209bcd5 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.c +++ b/contrib/perfmon/src/gpmon/pg_query_state.c @@ -527,7 +527,7 @@ pg_query_state(PG_FUNCTION_ARGS) { typedef struct { - PGPROC *proc; + int pid; ListCell *frame_cursor; int frame_index; List *stack; @@ 
-684,7 +684,7 @@ pg_query_state(PG_FUNCTION_ARGS) qs_stack = deserialize_stack(current_msg->stack, current_msg->stack_depth); - p_state->proc = current_msg->proc; + p_state->pid = current_msg->pid; p_state->stack = qs_stack; p_state->frame_index = 0; p_state->frame_cursor = list_head(qs_stack); @@ -733,11 +733,11 @@ pg_query_state(PG_FUNCTION_ARGS) /* Make and return next tuple to caller */ MemSet(values, 0, sizeof(values)); MemSet(nulls, 0, sizeof(nulls)); - values[0] = Int32GetDatum(p_state->proc->pid); + values[0] = Int32GetDatum(p_state->pid); values[1] = Int32GetDatum(p_state->frame_index); values[2] = PointerGetDatum(frame->query); values[3] = PointerGetDatum(frame->plan); - if (p_state->proc->pid == pid) + if (p_state->pid == pid) nulls[4] = true; else values[4] = Int32GetDatum(pid); diff --git a/contrib/perfmon/src/gpmon/pg_query_state.h b/contrib/perfmon/src/gpmon/pg_query_state.h index e73f6a38a0f..b93be738937 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.h +++ b/contrib/perfmon/src/gpmon/pg_query_state.h @@ -57,7 +57,7 @@ typedef struct { int reqid; int length; /* size of message record, for sanity check */ - PGPROC *proc; + int pid; PG_QS_RequestResult result_code; int warnings; int stack_depth; @@ -91,6 +91,7 @@ typedef struct int reqid; int length; /* size of message record, for sanity check */ PGPROC *proc; + PG_QS_RequestResult result_code; int sliceIndex; uint64 queryId; diff --git a/contrib/perfmon/src/gpmon/signal_handler.c b/contrib/perfmon/src/gpmon/signal_handler.c index a539153757a..2ea498eeef1 100644 --- a/contrib/perfmon/src/gpmon/signal_handler.c +++ b/contrib/perfmon/src/gpmon/signal_handler.c @@ -443,7 +443,7 @@ QD_SendQueryState(shm_mq_handle *mqh, PGPROC *proc) msg->reqid = params->reqid; msg->length = msglen; - msg->proc = MyProc; + msg->pid = MyProc->pid; msg->result_code = QS_RETURNED; msg->warnings = 0; @@ -598,7 +598,7 @@ send_cdbComponents_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) { res = false; if (msg != 
NULL) - *msg = (shm_mq_msg){reqid, BASE_SIZEOF_SHM_MQ_MSG, MyProc, QUERY_NOT_RUNNING}; + *msg = (shm_mq_msg){reqid, BASE_SIZEOF_SHM_MQ_MSG, MyProc->pid, QUERY_NOT_RUNNING}; } return res; @@ -608,7 +608,7 @@ static void set_msg(shm_mq_msg *msg, int reqid, PG_QS_RequestResult res) { if (msg != NULL) - *msg = (shm_mq_msg){reqid, BASE_SIZEOF_SHM_MQ_MSG, MyProc, res}; + *msg = (shm_mq_msg){reqid, BASE_SIZEOF_SHM_MQ_MSG, MyProc->pid, res}; } static bool query_state_pre_check(shm_mq_handle *mqh, int reqid, shm_mq_msg *msg) @@ -686,7 +686,7 @@ receive_QE_query_state(shm_mq_handle *mqh, List **query_state_info_list) return false; } *query_state_info_list = lappend(*query_state_info_list, seg_query_state_info); - elog(DEBUG1, "receive QE query state slice %d, proc %d successfully", seg_query_state_info->sliceIndex, seg_query_state_info->proc->backendId); + elog(DEBUG1, "receive QE query state slice %d", seg_query_state_info->sliceIndex); } return true; } From 27f6273685d5dbccd5530c8c2f8a393b850120bb Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Thu, 26 Dec 2024 18:18:47 +0800 Subject: [PATCH 29/40] [perfmon] Fix gpmon_catqrynow.py tmid is generated by gpmmon now, qe or qd doesn't know it, so the query text file uses 0 as tmid to generate filename --- contrib/perfmon/gpmon_catqrynow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/perfmon/gpmon_catqrynow.py b/contrib/perfmon/gpmon_catqrynow.py index e0cfdc7a9ca..eead2311cfa 100644 --- a/contrib/perfmon/gpmon_catqrynow.py +++ b/contrib/perfmon/gpmon_catqrynow.py @@ -18,7 +18,7 @@ priority = '' fp = None try: - fp = open(os.path.join(GPMONDIR, "q%s-%s-%s.txt" % (tmid, xid, cid)), 'r') + fp = open(os.path.join(GPMONDIR, "q%s-%s-%s.txt" % (0, xid, cid)), 'r') meta = fp.readline().split(' ') qrytxt = fp.read(int(meta[0])).strip() From dee568d80997c98924bfd0206f250f011d0e22fb Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Thu, 26 Dec 2024 14:00:39 +0800 Subject: [PATCH 30/40] [perfmon] add test for
query text in multiple lines --- contrib/perfmon/expected/query.out | 17 +++++++++++------ contrib/perfmon/sql/query.sql | 3 +++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index 28a8ddc0d1a..583370dd7f3 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -15,10 +15,13 @@ NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. INSERT INTO foo SELECT generate_series(0,10); INSERT INTO test SELECT generate_series(0,10); +-- test query text in multiple lines +INSERT INTO test +SELECT generate_series(0,10); select count(*) from foo,test where foo.a=test.a; count ------- - 11 + 22 (1 row) -- test nested query @@ -30,7 +33,7 @@ $$ language plpgsql; select * from n_join_foo_test(); n_join_foo_test ----------------- - 11 + 22 (1 row) DROP TABLE foo; @@ -92,14 +95,16 @@ where ssid = :sess_id order by ccnt; 4 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t 8 | done | INSERT INTO foo SELECT generate_series(0,10); | t 10 | done | INSERT INTO test SELECT generate_series(0,10); | t - 12 | done | select count(*) from foo,test where foo.a=test.a; | t - 15 | done | select * from n_join_foo_test(); | t -(6 rows) + 12 | done | INSERT INTO test +| t + | | SELECT generate_series(0,10); | + 14 | done | select count(*) from foo,test where foo.a=test.a; | t + 17 | done | select * from n_join_foo_test(); | t +(7 rows) SELECT COUNT(*) FROM (SELECT DISTINCT ccnt FROM queries_history where ssid = :sess_id) as temp; count ------- - 6 + 7 (1 row) diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index 9195e66d0e2..64de66d9dcf 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -30,6 +30,9 @@ CREATE TABLE 
foo(a int); CREATE TABLE test(a int); INSERT INTO foo SELECT generate_series(0,10); INSERT INTO test SELECT generate_series(0,10); +-- test query text in multiple lines +INSERT INTO test +SELECT generate_series(0,10); select count(*) from foo,test where foo.a=test.a; -- test nested query create or replace function n_join_foo_test() returns integer as $$ From 09cb20ff7532fa551a22cbb03282ad1f9b3edf53 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Fri, 3 Jan 2025 16:40:05 +0800 Subject: [PATCH 31/40] perfmon: Fix pg_query_state receives QE query state failed As the PART_RCV_DELAY (100ms) is much smaller than the WRITING_DELAY (1s), when the mq is full, receiver will timeout and report error before the writer wakes up. As the plan can be very large, even when there are many slices in a plan, so just increase the size of mq in shared memory. --- contrib/perfmon/src/gpmon/pg_query_state.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/contrib/perfmon/src/gpmon/pg_query_state.h b/contrib/perfmon/src/gpmon/pg_query_state.h index b93be738937..061a41f02d5 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.h +++ b/contrib/perfmon/src/gpmon/pg_query_state.h @@ -18,8 +18,8 @@ #include "storage/shm_mq.h" -#define QUEUE_SIZE (16 * 1024) -#define MSG_MAX_SIZE 1024 +#define QUEUE_SIZE (64 * 1024) +#define MSG_MAX_SIZE (4 * 1024) #define WRITING_DELAY (100 * 1000) /* 100ms */ #define NUM_OF_ATTEMPTS 6 @@ -31,14 +31,14 @@ #define PG_QS_SND_KEY 1 /* Receive timeout should be larger than send timeout to let workers stop waiting before polling process */ -#define MAX_RCV_TIMEOUT 6000 /* 6 seconds */ -#define MAX_SND_TIMEOUT 3000 /* 3 seconds */ +#define MAX_RCV_TIMEOUT 2000 /* 2 seconds */ +#define MAX_SND_TIMEOUT 1000 /* 1 seconds */ /* * Delay for receiving parts of full message (in case SHM_MQ_WOULD_BLOCK code), * should be tess than MAX_RCV_TIMEOUT */ -#define PART_RCV_DELAY 1000 /* 1 second */ +#define PART_RCV_DELAY 100 /* 100 ms */ /* * Result
status on query state request from asked backend From 622d20d450deda13d771b54a0602b3a6de8b4f71 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Tue, 7 Jan 2025 17:24:13 +0800 Subject: [PATCH 32/40] perfmon: fix empty query_text and improve performance When flushing queries to queries_now or queries_tail, for each query record, function 'gpdb_get_spill_file_size_from_query' is called. Found that function call costs a lot of time (the perf metrics tell that). And that makes the agg_dump cannot call each 'quantum', then the query records cannot be flushed to the tables in time. But the query text file will be deleted if the query has been finished for 3 * quantum time, so will see query_text is empty in lots of query records. Replace 'gpdb_get_spill_file_size_from_query' with 'gpdb_get_spill_file_size' , which is only called once for each 'agg_dump', which can improve performance and solve the above issues. --- contrib/perfmon/src/gpmmon/gpmon_agg.c | 20 ++++++-- contrib/perfmon/src/gpmmon/gpmondb.c | 69 ++++++++++++-------------- contrib/perfmon/src/gpmmon/gpmondb.h | 2 +- contrib/perfmon/src/include/gpmon.h | 2 +- 4 files changed, 50 insertions(+), 43 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 5afa4f93608..2fd332720f2 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -466,6 +466,7 @@ apr_status_t agg_dump(agg_t* agg) char nowstr[GPMON_DATE_BUF_SIZE]; FILE* fp_queries_now = 0; FILE* fp_queries_tail = 0; + apr_hash_t *spill_file_tab = NULL; dbmetrics_t dbmetrics = {0}; @@ -489,6 +490,8 @@ apr_status_t agg_dump(agg_t* agg) bloom_set(&bloom, GPMON_DIR "diskspace_tail.dat"); bloom_set(&bloom, GPMON_DIR "diskspace_stage.dat"); bloom_set(&bloom, GPMON_DIR "_diskspace_tail.dat"); + // get spill file size + spill_file_tab = gpdb_get_spill_file_size(agg->pool); /* dump metrics */ temp_bytes_written = write_system(agg, nowstr); @@ -512,6 +515,15 @@ apr_status_t
agg_dump(agg_t* agg) qdnode_t* qdnode; apr_hash_this(hi, 0, 0, &vptr); qdnode = vptr; + if (spill_file_tab != NULL) + { + char *key = apr_psprintf(agg->pool, "%d-%d", qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); + long *spill_file_size = apr_hash_get(spill_file_tab, key, APR_HASH_KEY_STRING); + if (spill_file_size) + { + qdnode->qlog.p_metrics.spill_files_size = *spill_file_size; + } + } if (qdnode->qlog.status == GPMON_QLOG_STATUS_DONE || qdnode->qlog.status == GPMON_QLOG_STATUS_ERROR) { @@ -1018,8 +1030,6 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch qdnode->qlog.p_metrics.cpu_skew += cpu_skew; //row_skew = get_row_skew(qdnode); //rowsout = get_rowsout(qdnode); - // get spill file size - gpdb_get_spill_file_size_from_query(qdnode); if (qdnode->qlog.tsubmit) { @@ -1251,8 +1261,6 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows cpu_skew = get_cpu_skew(qdnode); qdnode->qlog.p_metrics.cpu_skew += cpu_skew; - // get spill file size - gpdb_get_spill_file_size_from_query(qdnode); format_time(qdnode->qlog.tsubmit, timsubmitted); format_time(qdnode->qlog.tstart, timstarted); format_time(qdnode->qlog.tfin, timfinished); @@ -1290,6 +1298,10 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows array[0] = replaceQuotes(array[0], pool, &size); array[1] = replaceQuotes(array[1], pool, &size); } + else + { + gpmon_warning(FLINE, "missing expected qyuery file: %s", qfname); + } int line_size = (1024+size)*sizeof(char); char* line = apr_palloc(pool,line_size); diff --git a/contrib/perfmon/src/gpmmon/gpmondb.c b/contrib/perfmon/src/gpmmon/gpmondb.c index e1c7348894c..5c06cd75873 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.c +++ b/contrib/perfmon/src/gpmmon/gpmondb.c @@ -718,49 +718,44 @@ void gpdb_get_single_string_from_query(const char* QUERY, char** resultstring, a *resultstring = tmpoutput; } -void gpdb_get_spill_file_size_from_query(qdnode_t *qdnode) +apr_hash_t 
*gpdb_get_spill_file_size(apr_pool_t *pool) { - char query[200]; - snprintf(query, sizeof(query), "select sum(size) from gp_toolkit.gp_workfile_usage_per_query where sess_id=%d And command_cnt=%d;", - qdnode->qlog.key.ssid,qdnode->qlog.key.ccnt); - - PGconn* conn = 0; - PGresult* result = 0; - char* tmpoutput = 0; - int rowcount; - const char* errmsg = gpdb_exec(&conn, &result, query); - if (errmsg) + apr_hash_t *spill_file_tab = NULL; + PGconn *conn = 0; + PGresult *result = 0; + const char *query = "select sess_id, command_cnt, sum(size) from gp_toolkit.gp_workfile_usage_per_query group by (sess_id, command_cnt)"; + int rowcount = 0; + + const char *errmsg = gpdb_exec(&conn, &result, query); + if (errmsg) + { + gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, query); + } + else + { + rowcount = PQntuples(result); + } + spill_file_tab = apr_hash_make(pool); + if (!spill_file_tab) + gpmon_warning(FLINE, "Out of memory"); + else + { + for (int i = 0; i < rowcount; i ++) { - gpmon_warning(FLINE, "GPDB error %s\n\tquery: %s\n", errmsg, query); + char *sessid = PQgetvalue(result, i, 0); + char *command_cnt = PQgetvalue(result, i, 1); + long *size = apr_pcalloc(pool, sizeof(long)); + *size = atol(PQgetvalue(result, i, 2)); + char *key = apr_psprintf(pool, "%s-%s", sessid, command_cnt); + apr_hash_set(spill_file_tab, key, APR_HASH_KEY_STRING, size); } - else - { - rowcount = PQntuples(result); - if (rowcount == 1) - { - tmpoutput = PQgetvalue(result, 0, 0); - } - else if (rowcount > 1) - { - gpmon_warning(FLINE, "unexpected number of rows returned from query %s", query); - } - } - - PQclear(result); - PQfinish(conn); + } - if (tmpoutput) - { - uint64_t temp_result = 0; - sscanf(tmpoutput, "%lu", &temp_result); - if (temp_result > 0 && temp_result > qdnode->qlog.p_metrics.spill_files_size) - { - qdnode->qlog.p_metrics.spill_files_size = temp_result; - } - } + PQclear(result); + PQfinish(conn); + return spill_file_tab; } - static void 
check_and_add_partition(PGconn* conn, const char* tbl, int begin_year, int begin_month, int end_year, int end_month) { PGresult* result = 0; diff --git a/contrib/perfmon/src/gpmmon/gpmondb.h b/contrib/perfmon/src/gpmmon/gpmondb.h index f05a48e5d7b..cbb32c70ead 100644 --- a/contrib/perfmon/src/gpmmon/gpmondb.h +++ b/contrib/perfmon/src/gpmmon/gpmondb.h @@ -92,7 +92,7 @@ APR_DECLARE (void) create_log_alert_table(void); int find_token_in_config_string(char*, char**, const char*); void process_line_in_hadoop_cluster_info(apr_pool_t*, apr_hash_t*, char*, char*, char*); int get_hadoop_hosts_and_add_to_hosts(apr_pool_t*, apr_hash_t*, mmon_options_t*); -extern void gpdb_get_spill_file_size_from_query(qdnode_t* qdnode); +extern apr_hash_t *gpdb_get_spill_file_size(apr_pool_t * pool); apr_status_t truncate_file(char*, apr_pool_t*); #endif /* GPMONDB_H */ diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index b0a4e0cb32c..3fd1b364a76 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -135,7 +135,7 @@ struct gpmon_proc_metrics_t { uint32 fd_cnt; /* # opened files / sockets etc */ float cpu_pct; /* cpu usage % */ double cpu_skew; - uint64 spill_files_size; + long spill_files_size; struct { uint64 size, resident, share; } mem; From 33d7cc4ce19d3c822b34673dd9375b5065756b11 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Thu, 9 Jan 2025 18:09:01 +0800 Subject: [PATCH 33/40] perfmon: Fix instrument check in pg_query_state --- contrib/perfmon/src/gpmon/pg_query_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/perfmon/src/gpmon/pg_query_state.c b/contrib/perfmon/src/gpmon/pg_query_state.c index b661209bcd5..ba7235a4b8b 100644 --- a/contrib/perfmon/src/gpmon/pg_query_state.c +++ b/contrib/perfmon/src/gpmon/pg_query_state.c @@ -1431,7 +1431,7 @@ qs_ExecutorEnd(QueryDesc *queryDesc) PG_TRY(); { if (Gp_role == GP_ROLE_EXECUTE && enable_qs_runtime() && - 
(queryDesc->instrument_options | INSTRUMENT_ROWS) && + (queryDesc->instrument_options & INSTRUMENT_ROWS) && queryDesc->planstate->instrument) { StringInfo strInfo = cdbexplain_getExecStats_runtime(queryDesc); From e410a7750ba8496ce43b21204c07900f801c3e47 Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Wed, 15 Jan 2025 09:49:19 +0800 Subject: [PATCH 34/40] [perfmon] Refactor metrics collection Refactor metrics collection to distinguish between historical and real-time ones. - For historical queries, take the maximum value of memory and spill_file_size. - For real-time queries, take the average value of memory and spill_file_size. - For cpu_skew, use the formula in the Greenplum official documentation: cpu_skew = 1 - (cpu_avg / max_seg_cpu_sum). - For real-time cpu, take the avg of a interval, and the avg cpu of the whole query lifetime saved in queries_history --- contrib/perfmon/expected/query.out | 53 +++- contrib/perfmon/sql/query.sql | 7 +- contrib/perfmon/src/gpmmon/gpmmon.c | 8 +- contrib/perfmon/src/gpmmon/gpmon_agg.c | 378 ++++++++++++------------- contrib/perfmon/src/gpmmon/gpmon_agg.h | 21 +- contrib/perfmon/src/gpmon/gpmon.c | 1 - contrib/perfmon/src/gpsmon/gpsmon.c | 210 +++++++------- contrib/perfmon/src/include/gpmon.h | 5 + contrib/perfmon/src/include/gpmonlib.h | 3 +- 9 files changed, 370 insertions(+), 316 deletions(-) diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index 583370dd7f3..d04e081561d 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -1,8 +1,36 @@ -- start_ignore +-- wait a while as sometimes the gpmmon is not ready +\c gpperfmon +CREATE OR REPLACE FUNCTION wait_for_gpmmon_work() RETURNS void AS $$ +DECLARE +DECLARE +start_time timestamptz := clock_timestamp(); +updated bool; +BEGIN + -- we don't want to wait forever; loop will exit after 60 seconds + FOR i IN 1 .. 
1000 LOOP + SELECT(SELECT count(*) > 0 from queries_history ) INTO updated; + EXIT WHEN updated; + + -- wait a little + PERFORM pg_sleep_for('100 milliseconds'); + END LOOP; + -- report time waited in postmaster log (where it won't change test output) + RAISE log 'wait_for_gpmmon_work delayed % seconds', + EXTRACT(epoch FROM clock_timestamp() - start_time); +END +$$ LANGUAGE plpgsql; +select wait_for_gpmmon_work(); + wait_for_gpmmon_work +---------------------- + +(1 row) + +\c contrib_regression select sess_id from pg_stat_activity where pg_backend_pid()=pid; sess_id --------- - 26 + 7316 (1 row) \gset @@ -13,15 +41,15 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur CREATE TABLE test(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
-INSERT INTO foo SELECT generate_series(0,10); -INSERT INTO test SELECT generate_series(0,10); +INSERT INTO foo SELECT generate_series(0,30000000); +INSERT INTO test SELECT generate_series(0,30000000); -- test query text in multiple lines INSERT INTO test SELECT generate_series(0,10); select count(*) from foo,test where foo.a=test.a; - count -------- - 22 + count +---------- + 30000012 (1 row) -- test nested query @@ -33,7 +61,7 @@ $$ language plpgsql; select * from n_join_foo_test(); n_join_foo_test ----------------- - 22 + 30000012 (1 row) DROP TABLE foo; @@ -93,8 +121,8 @@ where ssid = :sess_id order by ccnt; ------+--------+------------------------------------------------------------------+---------- 2 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t 4 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t - 8 | done | INSERT INTO foo SELECT generate_series(0,10); | t - 10 | done | INSERT INTO test SELECT generate_series(0,10); | t + 8 | done | INSERT INTO foo SELECT generate_series(0,30000000); | t + 10 | done | INSERT INTO test SELECT generate_series(0,30000000); | t 12 | done | INSERT INTO test +| t | | SELECT generate_series(0,10); | 14 | done | select count(*) from foo,test where foo.a=test.a; | t @@ -108,3 +136,10 @@ where ssid = :sess_id) as temp; 7 (1 row) +select mem_peak>0, cpu_currpct>0, spill_file_size>0, skew_cpu>0, status, query_text, length(query_plan) > 0 from queries_history +where ssid = :sess_id and query_text = 'select count(*) from foo,test where foo.a=test.a;' + ?column? | ?column? | ?column? | ?column? | status | query_text | ?column? 
+----------+----------+----------+----------+--------+---------------------------------------------------+---------- + t | t | t | t | done | select count(*) from foo,test where foo.a=test.a; | t +(1 row) + diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index 64de66d9dcf..f788f3c76ae 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -28,8 +28,8 @@ select sess_id from pg_stat_activity where pg_backend_pid()=pid; CREATE TABLE foo(a int); CREATE TABLE test(a int); -INSERT INTO foo SELECT generate_series(0,10); -INSERT INTO test SELECT generate_series(0,10); +INSERT INTO foo SELECT generate_series(0,30000000); +INSERT INTO test SELECT generate_series(0,30000000); -- test query text in multiple lines INSERT INTO test SELECT generate_series(0,10); @@ -65,3 +65,6 @@ where ssid = :sess_id order by ccnt; SELECT COUNT(*) FROM (SELECT DISTINCT ccnt FROM queries_history where ssid = :sess_id) as temp; + +select mem_peak>0, cpu_currpct>0, spill_file_size>0, skew_cpu>0, status, query_text, length(query_plan) > 0 from queries_history +where ssid = :sess_id and query_text = 'select count(*) from foo,test where foo.a=test.a;' diff --git a/contrib/perfmon/src/gpmmon/gpmmon.c b/contrib/perfmon/src/gpmmon/gpmmon.c index 81d3909d064..f702a6b218a 100644 --- a/contrib/perfmon/src/gpmmon/gpmmon.c +++ b/contrib/perfmon/src/gpmmon/gpmmon.c @@ -49,6 +49,7 @@ int perfmon_port = 8888; bool perfmon_enabled = false; //bool perfmon_enable_query_metric; + void update_mmonlog_filename(void); int gpmmon_quantum(void); void incremement_tail_bytes(apr_uint64_t); @@ -254,7 +255,8 @@ static void recv_from_gx(SOCKET sock, short event, void* arg) int e; gp_smon_to_mmon_packet_t pktbuf; gp_smon_to_mmon_packet_t* pkt = 0; - TR2(("recv_from_gx sock %d host %s port %d\n", sock, h->hostname, ax.port)); + char* host_ip = get_connection_ip(h); + TR2(("recv_from_gx sock %d host %s ip %s port %d\n", sock, h->hostname, host_ip, ax.port)); if (event & 
EV_TIMEOUT) { @@ -297,7 +299,7 @@ static void recv_from_gx(SOCKET sock, short event, void* arg) else { pkt = &pktbuf; - TR2(("received packet %d from %s:%d\n", pkt->header.pkttype, h->hostname, ax.port)); + TR2(("received packet %d from %s:%s:%d\n", pkt->header.pkttype, h->hostname, host_ip, ax.port)); } } @@ -305,6 +307,7 @@ static void recv_from_gx(SOCKET sock, short event, void* arg) if (pkt) { apr_thread_mutex_lock(ax.agg_mutex); + pkt->ipaddr = host_ip; e = agg_put(ax.agg, pkt); apr_thread_mutex_unlock(ax.agg_mutex); if (e) @@ -1554,7 +1557,6 @@ static void gethostlist() /* Connect to database, get segment hosts from gp_segment_configuration */ gpdb_get_hostlist(&ax.hosttabsz, &ax.hosttab, ax.pool, &opt); - for (i = 0; i < ax.hosttabsz; ++i) { addressinfo_holder_t* addressinfo; diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.c b/contrib/perfmon/src/gpmmon/gpmon_agg.c index 2fd332720f2..a7e4cc48f33 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.c +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.c @@ -45,9 +45,7 @@ typedef struct mmon_qexec_t typedef struct mmon_query_seginfo_t { gpmon_query_seginfo_key_t key; - apr_int64_t final_rowsout; apr_uint64_t sum_cpu_elapsed; - apr_uint64_t sum_measures_rows_out; } mmon_query_seginfo_t; //The agg value at segment level for query struct agg_t @@ -76,7 +74,8 @@ extern void incremement_tail_bytes(apr_uint64_t bytes); static bool is_query_not_active(gpmon_qlogkey_t qkey, apr_hash_t *hash, apr_pool_t *pool); static void format_time(time_t tt, char *buf); static void set_tmid(gp_smon_to_mmon_packet_t* pkt, int32 tmid); - +static void update_query_now_metrics(qdnode_t* qdnode, long *spill_file_size); +static void update_query_history_metrics(qdnode_t* qdnode); static bool is_query_not_active(gpmon_qlogkey_t qkey, apr_hash_t *hash, apr_pool_t *pool) { // get active query of session @@ -158,9 +157,7 @@ static apr_status_t agg_put_queryseg(agg_t* agg, const gpmon_query_seginfo_t* me /* if found, replace it */ if (rec) { - 
rec->final_rowsout = met->final_rowsout; - rec->sum_cpu_elapsed += met->sum_cpu_elapsed; - rec->sum_measures_rows_out += met->sum_measures_rows_out; + rec->sum_cpu_elapsed = met->sum_cpu_elapsed; } else { /* not found, make new hash entry */ @@ -170,9 +167,7 @@ static apr_status_t agg_put_queryseg(agg_t* agg, const gpmon_query_seginfo_t* me return APR_ENOMEM; } memcpy(&rec->key, &met->key, sizeof(gpmon_query_seginfo_key_t)); - rec->final_rowsout = met->final_rowsout; rec->sum_cpu_elapsed = met->sum_cpu_elapsed; - rec->sum_measures_rows_out = met->sum_measures_rows_out; apr_hash_set(dp->query_seginfo_hash, &rec->key.segid, sizeof(rec->key.segid), rec); } @@ -198,31 +193,41 @@ static apr_status_t agg_put_metrics(agg_t* agg, const gpmon_metrics_t* met) return 0; } -static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) +static apr_status_t agg_put_query_metrics(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation, char* host_ip) { qdnode_t *node; node = apr_hash_get(agg->qtab, &qlog->key, sizeof(qlog->key)); + int* exist; if (!node) { - TR2(("put query metrics can not find qdnode from qtab, queryID :%d-%d-%d \n", - tmid, qlog->key.ssid, qlog->key.ccnt)); + TR2(("put query metrics can not find qdnode from qtab, queryID :%d-%d-%d, Host Ip:%s \n", + qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, host_ip)); } if (node) { - // here update the stats for the query - node->qlog.cpu_elapsed += qlog->cpu_elapsed; - node->qlog.p_metrics.cpu_pct += qlog->p_metrics.cpu_pct; - node->qlog.p_metrics.fd_cnt += qlog->p_metrics.fd_cnt; - if (qlog->p_metrics.mem.size > node->qlog.p_metrics.mem.size) - { - node->qlog.p_metrics.mem.size = qlog->p_metrics.mem.size; - }; + exist = apr_hash_get(node->host_hash, host_ip, strlen(host_ip)); + if(!exist) + { + exist = apr_pcalloc(agg->pool, sizeof(int)); + *exist = 1; + apr_hash_set(node->host_hash, host_ip, strlen(host_ip), exist); + node->host_cnt++; + } + else + { + ASSERT(*exist == 1); 
+ } + + // It is used to calculate the real-time value of the metrics for a small time period of the query. + node->p_interval_metrics.cpu_pct += qlog->p_metrics.cpu_pct; + node->p_interval_metrics.mem.resident += qlog->p_metrics.mem.resident; + node->num_metrics_packets_interval++; + node->last_updated_generation = generation; - node->num_metrics_packets++; - TR2(("Query Metrics: (host %s ssid %d ccnt %d) (cpuelapsed %d cpupct %f memsize %lu) / %d\n", - qlog->user, qlog->key.ssid, qlog->key.ccnt, (int) node->qlog.cpu_elapsed, node->qlog.p_metrics.cpu_pct, node->qlog.p_metrics.mem.size, - node->num_metrics_packets)); + TR2(("Query Metrics, Query ID: %d-%d-%d , Host Ip:%s (cpu_pct %f mem_resident %lu), interval pkt:%d, host cnt:%d\n", + qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, host_ip, qlog->p_metrics.cpu_pct, qlog->p_metrics.mem.resident, + node->num_metrics_packets_interval, node->host_cnt)); } return 0; } @@ -231,7 +236,7 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, apr_int64_t generation) { if (qlog->dbid == gpperfmon_dbid) { - TR2(("agg_put_qlog:(%d.%d.%d) ignore gpperfmon sql\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt)); + TR2(("agg_put_qlog:(%d-%d-%d) ignore gpperfmon sql\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt)); return 0; } @@ -243,7 +248,7 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, node->qlog.tsubmit = qlog->tsubmit; node->qlog.tfin = qlog->tfin; if (qlog->dbid != gpperfmon_dbid) { - TR2(("agg_put_qlog: found %d.%d.%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); + TR2(("agg_put_qlog: found %d-%d-%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); } } else { node = apr_pcalloc(agg->pool, sizeof(*node)); @@ -253,16 +258,17 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, node->qlog = *qlog; node->recorded = 0; 
node->qlog.cpu_elapsed = 0; - node->qlog.p_metrics.cpu_pct = 0.0f; - node->qlog.p_metrics.fd_cnt = 0; - node->qlog.p_metrics.cpu_skew = 0.0f; - node->qlog.p_metrics.mem.size = 0; - node->qlog.p_metrics.spill_files_size = 0; - node->num_metrics_packets = 0; - - node->qexec_hash = apr_hash_make(agg->pool); - if (!node->qexec_hash) { - TR2(("agg_put_qlog: qexec_hash = apr_hash_make(agg->pool) returned null\n")); + memset(&node->qlog.p_metrics, 0, sizeof(node->qlog.p_metrics)); + memset(&node->p_interval_metrics, 0, sizeof(node->p_interval_metrics)); + memset(&node->p_queries_history_metrics, 0, sizeof(node->p_queries_history_metrics)); + memset(&node->p_queries_now_metrics, 0, sizeof(node->p_queries_now_metrics)); + node->host_cnt = 0; + node->num_cpu_pct_interval_total = 0; + node->num_metrics_packets_interval = 0; + + node->host_hash = apr_hash_make(agg->pool); + if (!node->host_hash) { + TR2(("agg_put_qlog: host_hash = apr_hash_make(agg->pool) returned null\n")); return APR_ENOMEM; } @@ -274,7 +280,7 @@ static apr_status_t agg_put_qlog(agg_t* agg, const gpmon_qlog_t* qlog, apr_hash_set(agg->qtab, &node->qlog.key, sizeof(node->qlog.key), node); if (qlog->dbid != gpperfmon_dbid) { - TR2(("agg_put: new %d.%d.%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); + TR2(("agg_put: new %d-%d-%d generation %d recorded %d\n", qlog->key.tmid, qlog->key.ssid, qlog->key.ccnt, (int) generation, node->recorded)); } } node->last_updated_generation = generation; @@ -355,7 +361,7 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr if ( (dp->qlog.status == GPMON_QLOG_STATUS_DONE || dp->qlog.status == GPMON_QLOG_STATUS_ERROR) && (dp->qlog.tfin > 0 && ((dp->qlog.tfin - dp->qlog.tstart) < min_query_time ))) { - TR2(("agg_dup: skip short query %d.%d.%d generation %d, current generation %d, recorded %d\n", + TR2(("agg_dup: skip short query %d-%d-%d generation %d, current generation %d, recorded 
%d\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, (int) dp->last_updated_generation, (int) newagg->generation, dp->recorded)); continue; @@ -371,7 +377,7 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr { if (dp->qlog.dbid != gpperfmon_dbid) { - TR2(("agg_dup: skip %d.%d.%d generation %d, current generation %d, recorded %d\n", + TR2(("agg_dup: skip %d-%d-%d generation %d, current generation %d, recorded %d\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, (int) dp->last_updated_generation, (int) newagg->generation, dp->recorded)); } @@ -380,7 +386,7 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr } if (dp->qlog.dbid != gpperfmon_dbid) { - TR2( ("agg_dup: add %d.%d.%d, generation %d, recorded %d:\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, (int) dp->last_updated_generation, dp->recorded)); + TR2( ("agg_dup: add %d-%d-%d, generation %d, recorded %d:\n", dp->qlog.key.tmid, dp->qlog.key.ssid, dp->qlog.key.ccnt, (int) dp->last_updated_generation, dp->recorded)); } /* dup this entry */ @@ -391,11 +397,21 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr *newdp = *dp; + newdp->num_metrics_packets_interval = 0; + newdp->host_cnt = 0; + memset(&newdp->p_interval_metrics, 0, sizeof(newdp->p_interval_metrics)); + memset(&newdp->p_queries_now_metrics, 0, sizeof(newdp->p_queries_now_metrics)); + newdp->query_seginfo_hash = apr_hash_make(newagg->pool); if (!newdp->query_seginfo_hash) { agg_destroy(newagg); return APR_ENOMEM; } + newdp->host_hash = apr_hash_make(newagg->pool); + if (!newdp->host_hash) { + agg_destroy(newagg); + return APR_ENOMEM; + } cnt = 0; // Copy the query_seginfo hash table @@ -410,7 +426,6 @@ apr_status_t agg_dup(agg_t** retagg, agg_t* oldagg, apr_pool_t* parent_pool, apr *new_query_seginfo = *((mmon_query_seginfo_t*)vptr); apr_hash_set(newdp->query_seginfo_hash, &(new_query_seginfo->key.segid), 
sizeof(new_query_seginfo->key.segid), new_query_seginfo); - TR2( ("\t %d: (%d)\n", ++cnt, new_query_seginfo->key.segid)); } apr_hash_set(newagg->qtab, &newdp->qlog.key, sizeof(newdp->qlog.key), newdp); @@ -433,7 +448,7 @@ apr_status_t agg_put(agg_t* agg, gp_smon_to_mmon_packet_t* pkt) if (pkt->header.pkttype == GPMON_PKTTYPE_QLOG) return agg_put_qlog(agg, &pkt->u.qlog, agg->generation); if (pkt->header.pkttype == GPMON_PKTTYPE_QUERY_HOST_METRICS) - return agg_put_query_metrics(agg, &pkt->u.qlog, agg->generation); + return agg_put_query_metrics(agg, &pkt->u.qlog, agg->generation, pkt->ipaddr); if (pkt->header.pkttype == GPMON_PKTTYPE_FSINFO) return agg_put_fsinfo(agg, &pkt->u.fsinfo); if (pkt->header.pkttype == GPMON_PKTTYPE_QUERYSEG) @@ -466,8 +481,7 @@ apr_status_t agg_dump(agg_t* agg) char nowstr[GPMON_DATE_BUF_SIZE]; FILE* fp_queries_now = 0; FILE* fp_queries_tail = 0; - apr_hash_t *spill_file_tab = NULL; - + apr_hash_t *spill_file_tab = NULL; dbmetrics_t dbmetrics = {0}; apr_uint32_t temp_bytes_written = 0; @@ -490,8 +504,8 @@ apr_status_t agg_dump(agg_t* agg) bloom_set(&bloom, GPMON_DIR "diskspace_tail.dat"); bloom_set(&bloom, GPMON_DIR "diskspace_stage.dat"); bloom_set(&bloom, GPMON_DIR "_diskspace_tail.dat"); - // get spill file size - spill_file_tab = gpdb_get_spill_file_size(agg->pool); + // get spill file size + spill_file_tab = gpdb_get_spill_file_size(agg->pool); /* dump metrics */ temp_bytes_written = write_system(agg, nowstr); @@ -515,23 +529,12 @@ apr_status_t agg_dump(agg_t* agg) qdnode_t* qdnode; apr_hash_this(hi, 0, 0, &vptr); qdnode = vptr; - if (spill_file_tab != NULL) - { - char *key = apr_psprintf(agg->pool, "%d-%d", qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); - long *spill_file_size = apr_hash_get(spill_file_tab, key, APR_HASH_KEY_STRING); - if (spill_file_size) - { - qdnode->qlog.p_metrics.spill_files_size = *spill_file_size; - } - } if (qdnode->qlog.status == GPMON_QLOG_STATUS_DONE || qdnode->qlog.status == GPMON_QLOG_STATUS_ERROR) { if 
(!qdnode->recorded && ((qdnode->qlog.tfin - qdnode->qlog.tstart) >= min_query_time)) { - TR1(("queries_tail: %p add query %d.%d.%d, status %d, generation %d, recorded %d\n", - agg->qtab, qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, qdnode->qlog.status, (int) qdnode->last_updated_generation, qdnode->recorded)); - + update_query_history_metrics(qdnode); temp_bytes_written += write_qlog_full(fp_queries_tail, qdnode, nowstr, agg->pool); incremement_tail_bytes(temp_bytes_written); @@ -583,13 +586,22 @@ apr_status_t agg_dump(agg_t* agg) bloom_set(&bloom, fname); } + long *spill_file_size = NULL; + if (spill_file_tab != NULL) + { + char *key = apr_psprintf(agg->pool, "%d-%d", qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt); + spill_file_size = apr_hash_get(spill_file_tab, key, APR_HASH_KEY_STRING); + } + /* write to _query_now.dat */ if (qdnode->qlog.status != GPMON_QLOG_STATUS_DONE && qdnode->qlog.status != GPMON_QLOG_STATUS_ERROR) { + update_query_now_metrics(qdnode, spill_file_size); write_qlog(fp_queries_now, qdnode, nowstr, 0); } else if (qdnode->qlog.tfin - qdnode->qlog.tstart >= min_query_time) { + update_query_now_metrics(qdnode, spill_file_size); write_qlog(fp_queries_now, qdnode, nowstr, 1); } @@ -617,7 +629,7 @@ static void delete_old_files(bloom_t* bloom) char findCmd[512] = {0}; FILE* fp = NULL; time_t cutoff = time(0) - gpmmon_quantum() * 3; - cutoff = cutoff < 10 ? 10 : cutoff; + cutoff = cutoff < 10 ? 
10 : cutoff; /* Need to remove trailing / in dir so find results are consistent * between platforms @@ -899,120 +911,128 @@ static apr_uint32_t write_system(agg_t* agg, const char* nowstr) return bytes_written; } -static void _get_sum_seg_info(apr_hash_t* segtab, apr_int64_t* total_data_out, int* segcount_out) -{ - apr_hash_index_t *hi; - void* valptr; - apr_int64_t* seg_data_sum = NULL; - - for (hi = apr_hash_first(NULL, segtab); hi; hi = apr_hash_next(hi)) - { - apr_hash_this(hi, 0, 0, &valptr); - seg_data_sum = (apr_int64_t*) valptr; - *total_data_out += *seg_data_sum; - TR2(("(SKEW) Segment resource usage: %d\n", (int) *seg_data_sum)); - (*segcount_out)++; - } -} - -static void _get_sum_deviation_squared(apr_hash_t* segtab, const apr_int64_t data_avg, apr_int64_t* total_deviation_squared_out) -{ - apr_hash_index_t *hi; - void* valptr; - apr_int64_t* seg_data_sum = NULL; - - for (hi = apr_hash_first(NULL, segtab); hi; hi = apr_hash_next(hi)) - { - apr_int64_t dev = 0; - - apr_hash_this(hi, NULL, NULL, &valptr); - seg_data_sum = (apr_int64_t*) valptr; - dev = *seg_data_sum - data_avg; - TR2(("(SKEW) Deviation: %d\n", (int) dev)); - *total_deviation_squared_out += dev * dev; - } -} - static double get_cpu_skew(qdnode_t* qdnode) { - apr_pool_t* tmp_pool; - apr_hash_t* segtab; - apr_hash_index_t *hi; - + apr_hash_index_t *hi; apr_int64_t cpu_avg = 0; apr_int64_t total_cpu = 0; - apr_int64_t total_deviation_squared = 0; - double variance = 0; - double standard_deviation = 0; - double coefficient_of_variation = 0; - apr_int64_t* seg_cpu_sum = NULL; - void* valptr; - + apr_int64_t max_seg_cpu_sum = 0; + double cpu_skew = 0; int segcnt = 0; - int e; + void* valptr; if (!qdnode) return 0.0f; - if (0 != (e = apr_pool_create_alloc(&tmp_pool, 0))) - { - gpmon_warningx(FLINE, e, "apr_pool_create_alloc failed"); - return 0.0f; - } - - segtab = apr_hash_make(tmp_pool); - if (!segtab) - { - gpmon_warning(FLINE, "Out of memory"); - return 0.0f; - } - - TR2(("Calc mean per 
segment\n")); - for (hi = apr_hash_first(NULL, qdnode->query_seginfo_hash); hi; hi = apr_hash_next(hi)) { mmon_query_seginfo_t *rec; apr_hash_this(hi, 0, 0, &valptr); rec = (mmon_query_seginfo_t*) valptr; + if (rec->key.segid == -1) + continue; - if (rec->key.segid == -1) - continue; + TR2(("segment cpu elapsed %lu, queryID:%d-%d-%d, segmentID:%d \n", + rec->sum_cpu_elapsed, rec->key.qkey.tmid, rec->key.qkey.ssid, rec->key.qkey.ccnt, rec->key.segid)); - seg_cpu_sum = apr_hash_get(segtab, &rec->key.segid, sizeof(rec->key.segid)); + if (rec->sum_cpu_elapsed > max_seg_cpu_sum){ + max_seg_cpu_sum = rec->sum_cpu_elapsed; + }; - if (!seg_cpu_sum) { - seg_cpu_sum = apr_palloc(tmp_pool, sizeof(apr_int64_t)); - *seg_cpu_sum = 0; - } - *seg_cpu_sum += rec->sum_cpu_elapsed; - apr_hash_set(segtab, &rec->key.segid, sizeof(rec->key.segid), seg_cpu_sum); + total_cpu += rec->sum_cpu_elapsed; + segcnt++; } - _get_sum_seg_info(segtab, &total_cpu, &segcnt); - if (!segcnt) { TR2(("No segments for CPU skew calculation\n")); - apr_pool_destroy(tmp_pool); return 0.0f; } cpu_avg = total_cpu / segcnt; - TR2(("(SKEW) Avg resource usage: %" FMT64 "\n", cpu_avg)); + cpu_skew = 1 - (cpu_avg / max_seg_cpu_sum); + TR2(("(SKEW) queryID:%d-%d-%d, Avg cpu usage: %" FMT64 ", Max segment cpu sum : %" FMT64 ", cpu skew : %lf \n", + qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, cpu_avg, max_seg_cpu_sum, cpu_skew)); + return cpu_skew; +} + - _get_sum_deviation_squared(segtab, cpu_avg, &total_deviation_squared); +/* + * The update_query_now_metrics function is used to update qdnode.p_queries_now_metrics. + * p_queries_now_metrics is calculated from p_interval_metrics and then written into the queries_now table. + * + * cpu_skew: Since cpu_elapsed is an accumulated value, the cpu_skew is calculated directly using this accumulated value each time. + * + * spill_file_size: The value is obtained in real - time by querying the gp_workfile_usage_per_query table. 
+ * Meanwhile, the maximum value of this value is recorded in p_queries_history_metrics for subsequent writing into the queries_tail table. + * + * cpu_pct: Within a time window, multiple segment hosts may send multiple packets to gpmmon. At this time, the average value within the time window needs to be calculated. + * Therefore, the accumulated cpu_pct value should be divided by (the total number of received packets / the number of hosts that send packets). + * For example, suppose three segment hosts send a total of 9 packets during a certain period. After gpmmon accumulates these 9 packets, the actual cpu_pct should be sum_cpu_pct/(9/3). + * Here, 9/3 means that three groups of packets are received within this time window (the sum of the packets sent by the three segment hosts is regarded as one group). + * + * mem.resident: Using resident can more accurately reflect the actual physical memory value used for executing the query. + * Its calculation method is the same as that of cpu_pct, which will not be repeated here. + * At the same time, the maximum value of mem.resident is recorded for subsequent writing into the queries_tail table. 
+ */ +static void update_query_now_metrics(qdnode_t* qdnode, long *spill_file_size) +{ + qdnode->p_queries_now_metrics.cpu_skew = get_cpu_skew(qdnode); - variance = total_deviation_squared / (double)segcnt; + if (spill_file_size != NULL) + { + qdnode->p_queries_now_metrics.spill_files_size = *spill_file_size; + } - standard_deviation = sqrt(variance); + if (qdnode->p_queries_now_metrics.spill_files_size > 0 && qdnode->p_queries_now_metrics.spill_files_size > qdnode->p_queries_history_metrics.spill_files_size) + { + qdnode->p_queries_history_metrics.spill_files_size = qdnode->p_queries_now_metrics.spill_files_size; + TR2(("(SPILL FILE) queryID:%d-%d-%d, spill file size peak: %lu \n", qdnode->qlog.key.tmid, + qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, qdnode->p_queries_history_metrics.spill_files_size)); + } - TR2(("(SKEW) CPU standard deviation: %f\n", standard_deviation)); + if (qdnode->num_metrics_packets_interval && qdnode->host_cnt) + { + qdnode->p_queries_now_metrics.cpu_pct = qdnode->p_interval_metrics.cpu_pct / (qdnode->num_metrics_packets_interval / qdnode->host_cnt); + qdnode->p_queries_history_metrics.cpu_pct += qdnode->p_queries_now_metrics.cpu_pct; + qdnode->num_cpu_pct_interval_total++; - coefficient_of_variation = cpu_avg ? standard_deviation/(double)cpu_avg : 0.0f; + qdnode->p_queries_now_metrics.mem.resident = qdnode->p_interval_metrics.mem.resident / (qdnode->num_metrics_packets_interval / qdnode->host_cnt); + if (qdnode->p_queries_now_metrics.mem.resident > qdnode->p_queries_history_metrics.mem.resident){ + qdnode->p_queries_history_metrics.mem.resident = qdnode->p_queries_now_metrics.mem.resident; + } + } + else + { + qdnode->p_queries_now_metrics.cpu_pct = 0; + qdnode->p_queries_now_metrics.mem.resident = 0; + } +} - apr_pool_destroy(tmp_pool); - TR2(("(SKEW) CPU Skew: %f\n", coefficient_of_variation)); - return coefficient_of_variation; +/* + * The update_query_history_metrics function is used to update qdnode.p_queries_history_metrics. 
+ * p_queries_history_metrics is calculated from p_query_now_metrics and then written into the queries_now table. + * + * cpu_skew: As cpu_elapsed is an accumulated value, the cpu_skew is directly calculated using this accumulated value each time. + * + * spill_file_size: The maximum value of the spill file size is recorded in the queries_tail table. + * + * cpu_pct: The average value of cpu_pct throughout the entire lifecycle of this query is obtained by + * dividing the cumulative value of cpu_pct in each previous time window by the total number of time windows. + * + * mem.resident: The maximum value of mem.resident is recorded in the queries_tail table. + */ +static void update_query_history_metrics(qdnode_t* qdnode) +{ + qdnode->p_queries_history_metrics.cpu_skew = get_cpu_skew(qdnode); + if (qdnode->num_cpu_pct_interval_total) + { + qdnode->p_queries_history_metrics.cpu_pct = qdnode->p_queries_history_metrics.cpu_pct / qdnode->num_cpu_pct_interval_total; + } + else + { + qdnode->p_queries_history_metrics.cpu_pct = 0.0f; + } } static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const char* nowstr, apr_uint32_t done) @@ -1020,17 +1040,9 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch char timsubmitted[GPMON_DATE_BUF_SIZE]; char timstarted[GPMON_DATE_BUF_SIZE]; char timfinished[GPMON_DATE_BUF_SIZE]; - double cpu_skew = 0.0f; - double row_skew = 0.0f; - int query_hash = 0; - apr_int64_t rowsout = 0; - float cpu_current; - int fd_cnt; - cpu_skew = get_cpu_skew(qdnode); - qdnode->qlog.p_metrics.cpu_skew += cpu_skew; - //row_skew = get_row_skew(qdnode); - //rowsout = get_rowsout(qdnode); - + double row_skew = 0.0f; + int query_hash = 0; + apr_int64_t rowsout = 0; if (qdnode->qlog.tsubmit) { gpmon_datetime((time_t)qdnode->qlog.tsubmit, timsubmitted); @@ -1058,20 +1070,11 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch snprintf(timfinished, GPMON_DATE_BUF_SIZE, "null"); } - - 
if (qdnode->num_metrics_packets) - { - // average cpu_pct per reporting machine - cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; - fd_cnt = qdnode->qlog.p_metrics.fd_cnt / qdnode->num_metrics_packets; - cpu_skew = qdnode->qlog.p_metrics.cpu_skew / qdnode->num_metrics_packets; - } - else - { - cpu_current = 0.0f; - fd_cnt = 0; - cpu_skew = 0.0f; - } + TR2(("fmt qlog to queries_now , queryID:%d-%d-%d, cpu pct:%f, mem_resident:%lu, spill_files_size:%lu, cpu_skew:%lf, segment host cnt:%d, pkt nums:%d\n", + qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, + qdnode->p_queries_now_metrics.cpu_pct, qdnode->p_queries_now_metrics.mem.resident, + qdnode->p_queries_now_metrics.spill_files_size, qdnode->p_queries_now_metrics.cpu_skew, + qdnode->host_cnt, qdnode->num_metrics_packets_interval)); snprintf(line, line_size, "%s|%d|%d|%d|%d|%s|%u|%d|%s|%s|%s|%s|%" FMT64 "|%" FMT64 "|%.4f|%.2f|%.2f|%d||||||%" FMTU64 "|%" FMTU64 "|%d|%d", nowstr, @@ -1088,12 +1091,12 @@ static void fmt_qlog(char* line, const int line_size, qdnode_t* qdnode, const ch gpmon_qlog_status_string(qdnode->qlog.status), rowsout, qdnode->qlog.cpu_elapsed, - cpu_current, - cpu_skew, + qdnode->p_queries_now_metrics.cpu_pct, + qdnode->p_queries_now_metrics.cpu_skew, row_skew, query_hash, - qdnode->qlog.p_metrics.mem.size, - qdnode->qlog.p_metrics.spill_files_size, + qdnode->p_queries_now_metrics.mem.resident, + qdnode->p_queries_now_metrics.spill_files_size, 0, 0 ); @@ -1252,35 +1255,15 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows char timsubmitted[GPMON_DATE_BUF_SIZE]; char timstarted[GPMON_DATE_BUF_SIZE]; char timfinished[GPMON_DATE_BUF_SIZE]; - double cpu_skew = 0.0f; double row_skew = 0.0f; int query_hash = 0; apr_int64_t rowsout = 0; - float cpu_current; - int fd_cnt; - cpu_skew = get_cpu_skew(qdnode); - qdnode->qlog.p_metrics.cpu_skew += cpu_skew; - format_time(qdnode->qlog.tsubmit, timsubmitted); 
format_time(qdnode->qlog.tstart, timstarted); format_time(qdnode->qlog.tfin, timfinished); - if (qdnode->num_metrics_packets) - { - // average cpu_pct per reporting machine - cpu_current = qdnode->qlog.p_metrics.cpu_pct / qdnode->num_metrics_packets; - fd_cnt = qdnode->qlog.p_metrics.fd_cnt / qdnode->num_metrics_packets; - cpu_skew = qdnode->qlog.p_metrics.cpu_skew / qdnode->num_metrics_packets; - } - else - { - cpu_current = 0.0f; - fd_cnt = 0; - cpu_skew = 0.0f; - } - - // get query text and plan - char* array[5] = {"", "", "", "", ""}; + // get query text and plan + char* array[5] = {"", "", "", "", ""}; const int qfname_size = 256; char qfname[qfname_size]; int size = 0; @@ -1303,6 +1286,11 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows gpmon_warning(FLINE, "missing expected qyuery file: %s", qfname); } + TR2(("fmt qlog to queries_tail, queryID:%d-%d-%d, cpu pct:%f, mem_resident_peak:%lu, spill_files_size_peak:%lu, cpu_skew:%lf, cpu_pct interval nums:%d\n", + qdnode->qlog.key.tmid, qdnode->qlog.key.ssid, qdnode->qlog.key.ccnt, + qdnode->p_queries_history_metrics.cpu_pct, qdnode->p_queries_history_metrics.mem.resident, + qdnode->p_queries_history_metrics.spill_files_size, qdnode->p_queries_history_metrics.cpu_skew, qdnode->num_cpu_pct_interval_total)); + int line_size = (1024+size)*sizeof(char); char* line = apr_palloc(pool,line_size); memset(line,0,line_size); @@ -1321,8 +1309,8 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows gpmon_qlog_status_string(qdnode->qlog.status), rowsout, qdnode->qlog.cpu_elapsed, - cpu_current, - cpu_skew, + qdnode->p_queries_history_metrics.cpu_pct, + qdnode->p_queries_history_metrics.cpu_skew, row_skew, query_hash, array[0], @@ -1330,8 +1318,8 @@ static apr_uint32_t write_qlog_full(FILE* fp, qdnode_t *qdnode, const char* nows array[2], array[3], array[4], - qdnode->qlog.p_metrics.mem.size, - qdnode->qlog.p_metrics.spill_files_size, + 
qdnode->p_queries_history_metrics.mem.resident, + qdnode->p_queries_history_metrics.spill_files_size, 0, 0 ); diff --git a/contrib/perfmon/src/gpmmon/gpmon_agg.h b/contrib/perfmon/src/gpmmon/gpmon_agg.h index c3757ea9724..fd3829d8d24 100644 --- a/contrib/perfmon/src/gpmmon/gpmon_agg.h +++ b/contrib/perfmon/src/gpmmon/gpmon_agg.h @@ -13,9 +13,24 @@ apr_status_t agg_dump(agg_t* agg); typedef struct qdnode_t { apr_int64_t last_updated_generation; int recorded; - int num_metrics_packets; + + // num_cpu_pct_interval_total represents how many time windows cpu_pct has gone through + // from being written into queries_now to being written into queries_tail. + int num_cpu_pct_interval_total; + + //The total number of packets received in a certain time window. When the metrics are written into queries_now, this value will be reset to 0. + //The definition of a time window is the period from the last time when queries_now was written to the current time when queries_now is written. + int num_metrics_packets_interval; + + //The p_interval_metrics records the instantaneous values of the metrics within a certain time window. + gpmon_proc_metrics_t p_interval_metrics; + //The p_queries_now_metrics is calculated from p_interval_metrics and then written into the queries_now table. + gpmon_proc_metrics_t p_queries_now_metrics; + //The p_queries_history_metrics is calculated from p_now_metrics and then written into the queries_tail table. 
+ gpmon_proc_metrics_t p_queries_history_metrics; + int host_cnt; gpmon_qlog_t qlog; - apr_hash_t* qexec_hash; - apr_hash_t* query_seginfo_hash; + apr_hash_t* host_hash; + apr_hash_t* query_seginfo_hash; } qdnode_t; #endif diff --git a/contrib/perfmon/src/gpmon/gpmon.c b/contrib/perfmon/src/gpmon/gpmon.c index 45a7bfc9e18..0b64a6d8309 100644 --- a/contrib/perfmon/src/gpmon/gpmon.c +++ b/contrib/perfmon/src/gpmon/gpmon.c @@ -547,7 +547,6 @@ gpmon_query_info_collect_hook(QueryMetricsStatus status, void *queryDesc) switch (status) { case METRICS_QUERY_START: - case METRICS_PLAN_NODE_EXECUTING: gpmon_send(gpmonPacket); break; default: diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index cca33c92183..2abb4849af3 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -24,6 +24,7 @@ void update_log_filename(void); void gx_main(int, apr_int64_t); +static void agg_metrics(apr_hash_t* pidtab, apr_hash_t* query_cpu_table, apr_hash_t* querysegtab, apr_pool_t* oldpool); /* Temporary global memory to store the qexec line for a send*/ char qexec_smon_temp_line[QEXEC_MAX_ROW_BUF_SIZE]; @@ -64,6 +65,7 @@ struct pidrec_t gpmon_qlogkey_t query_key; gpmon_query_seginfo_key_t qseg_key; gpmon_qexec_hash_key_t hash_key; + gpmon_pidtable_key_t pidtable_key; }; typedef struct gx_t gx_t; @@ -262,16 +264,18 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in pidrec_t* rec; apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); - rec = apr_hash_get(gx.pidtab, &key.pid, sizeof(key.pid)); - if (rec && rec->updated_tick == gx.tick) + gpmon_pidtable_key_t pidtable_key; + pidtable_key.pid = key.pid; + pidtable_key.qkey.tmid = tmid; + pidtable_key.qkey.ssid = ssid; + pidtable_key.qkey.ccnt = ccnt; + rec = apr_hash_get(gx.pidtab, &pidtable_key, sizeof(pidtable_key)); + if (rec && rec->updated_tick == gx.tick) return; /* updated in current cycle */ - memset(&cpu, 0, sizeof(cpu)); memset(&mem, 0, 
sizeof(mem)); memset(&fd, 0, sizeof(fd)); - TR2(("--------------------- starting %d\n", key.pid)); - if (!rec) { sigar_proc_exe_t exe; @@ -295,6 +299,8 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in rec->qseg_key = qseg_key; rec->hash_key = key; + rec->pidtable_key = pidtable_key; + rec->pname = rec->cwd = 0; if (0 == sigar_proc_exe_get(gx.sigar, key.pid, &exe)) { @@ -306,7 +312,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in if (!rec->cwd) rec->cwd = "unknown"; - apr_hash_set(gx.pidtab, &rec->pid, sizeof(rec->pid), rec); + apr_hash_set(gx.pidtab, &rec->pidtable_key, sizeof(rec->pidtable_key), rec); } status = sigar_proc_mem_get(gx.sigar, key.pid, &mem); @@ -314,7 +320,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); } return; } @@ -323,7 +329,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); } return; } @@ -332,7 +338,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); } return; } @@ -342,7 +348,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in { if (status != ESRCH) { - TR2(("[WARNING] %s. 
PID: %d\n", sigar_strerror(gx.sigar, status), key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); } return; } @@ -360,6 +366,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in #endif rec->cpu_elapsed = cpu.total; + TR2(("%s:---pid(%d) query id(%d-%d-%d) metrics refresh finished---\n", FLINE, key.pid, tmid, ssid, ccnt)); } @@ -663,6 +670,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) apr_pool_t* oldpool; apr_hash_t* qdtab; apr_hash_t* pidtab; + apr_hash_t* querysegtab; if (event & EV_TIMEOUT) // didn't get command from gpmmon, quit { if(gx.tcp_sock) @@ -674,7 +682,6 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) } return; } - apr_hash_t* querysegtab; n = recv(sock, &dump, 1, 0); if (n == 0) gx_exit("peer closed"); @@ -726,7 +733,6 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) pidrec_t* pidrec; int count = 0; apr_hash_t* query_cpu_table = NULL; - sigar_proc_state_t state; for (hi = apr_hash_first(0, qdtab); hi; hi = apr_hash_next(hi)) { @@ -736,7 +742,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) if (ppkt->header.pkttype != GPMON_PKTTYPE_QLOG) continue; - TR2(("%s: sending magic %x, pkttype %d, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->header.pkttype, + TR2(("%s: sending magic %x, pkttype GPMON_PKTTYPE_QLOG, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt)); send_smon_to_mon_pkt(sock, ppkt); count++; @@ -746,81 +752,8 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) query_cpu_table = apr_hash_make(oldpool); CHECKMEM(query_cpu_table); - // loop through PID's and add to Query CPU Hash Table - for (hi = apr_hash_first(0, pidtab); hi; hi = apr_hash_next(hi)) - { - void* vptr; - pidrec_t* queryMetric; - pidrec_t *pidrec; - - apr_hash_this(hi, 0, 0, &vptr); - pidrec = vptr; - if (!pidrec) - { - continue; - } - 
- TR2(("%s: %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, - pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct, pidrec->p_metrics.mem.size)); - - // table is keyed on query key - queryMetric = apr_hash_get(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key)); - - if (queryMetric) - { - // found other pids with same query key so add the metrics to that - - queryMetric->cpu_elapsed += pidrec->cpu_elapsed; - queryMetric->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; - queryMetric->p_metrics.fd_cnt += pidrec->p_metrics.fd_cnt; - queryMetric->p_metrics.mem.resident += pidrec->p_metrics.mem.resident; - queryMetric->p_metrics.mem.size += pidrec->p_metrics.mem.size; - queryMetric->p_metrics.mem.share += pidrec->p_metrics.mem.share; - TR2(("%s: increase %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", - FLINE, queryMetric->query_key.tmid, queryMetric->query_key.ssid, queryMetric->query_key.ccnt, queryMetric->pid, - queryMetric->cpu_elapsed, queryMetric->p_metrics.cpu_pct, queryMetric->p_metrics.mem.size)); - } - else - { - // insert existing pid record into table keyed by query key - queryMetric = apr_palloc(oldpool, sizeof(pidrec_t)); - memcpy(queryMetric, pidrec, sizeof(pidrec_t)); - apr_hash_set(query_cpu_table, &queryMetric->query_key, sizeof(gpmon_qlogkey_t), queryMetric); - } - - // add to queryseg hash table - gp_smon_to_mmon_packet_t *rec; - rec = apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); - if (rec) - { - rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; - } - else - { - rec = apr_palloc(apr_hash_pool_get(querysegtab), sizeof(gp_smon_to_mmon_packet_t)); - CHECKMEM(rec); - gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); - rec->u.queryseg.key = pidrec->qseg_key; - rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; - apr_hash_set(querysegtab, &rec->u.queryseg.key, 
sizeof(rec->u.queryseg.key), rec); - } - - //add to new pidtab if process is exist - int status = sigar_proc_state_get(gx.sigar,pidrec->pid, &state); - if (status == SIGAR_OK) - { - apr_pool_t *pool = apr_hash_pool_get(gx.pidtab); - pidrec_t *newpidrec = apr_palloc(pool, sizeof(*pidrec)); - memcpy(newpidrec, pidrec, sizeof(*pidrec)); - apr_hash_set(gx.pidtab, &newpidrec->pid, sizeof(newpidrec->pid), newpidrec); - TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); - continue; - } - TR2(("%s: %d-%d-%d pid %d pid status %d not add to new pidtab \n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, status)); - } + // Iterate the PID's hash table 'pidtab' to update query_cpu hash table and query_seg hash table + agg_metrics(pidtab, query_cpu_table, querysegtab, oldpool); /* * QUERYSEG packets must be sent after QLOG packets so that gpmmon can @@ -834,7 +767,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) if (ppkt->header.pkttype != GPMON_PKTTYPE_QUERYSEG) continue; - TR2(("%s: sending magic %x, pkttype %d, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->header.pkttype, + TR2(("%s: sending magic %x, pkttype GPMON_PKTTYPE_QUERYSEG, %d-%d-%d\n", FLINE, ppkt->header.magic, ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt)); send_smon_to_mon_pkt(sock, ppkt); count++; @@ -866,7 +799,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) TR2(("%s: SEND %d-%d-%d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", FLINE, ppkt->u.qlog.key.tmid, ppkt->u.qlog.key.ssid, ppkt->u.qlog.key.ccnt, - ppkt->u.qlog.cpu_elapsed, ppkt->u.qlog.p_metrics.cpu_pct, ppkt->u.qlog.p_metrics.mem.size)); + ppkt->u.qlog.cpu_elapsed, ppkt->u.qlog.p_metrics.cpu_pct, ppkt->u.qlog.p_metrics.mem.resident)); send_smon_to_mon_pkt(sock, ppkt); count++; @@ -892,6 +825,86 @@ static void gx_gettcpcmd(SOCKET sock, short event, 
void* arg) return; } +static void agg_metrics(apr_hash_t* pidtab, apr_hash_t* query_cpu_table, apr_hash_t* querysegtab, apr_pool_t* oldpool){ + sigar_proc_state_t state; + apr_hash_index_t* hi; + for (hi = apr_hash_first(0, pidtab); hi; hi = apr_hash_next(hi)) + { + void* vptr; + pidrec_t* queryMetric; + pidrec_t *pidrec; + + apr_hash_this(hi, 0, 0, &vptr); + pidrec = vptr; + if (!pidrec) + { + continue; + } + + TR2(("%s: %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, + pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct, pidrec->p_metrics.mem.resident)); + + // add to query_cpu_table,aggregate by tmid-ssid-ccnt + queryMetric = apr_hash_get(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key)); + + if (queryMetric) + { + // found other pids with same query key so add the metrics to that + + queryMetric->cpu_elapsed += pidrec->cpu_elapsed; + queryMetric->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; + queryMetric->p_metrics.fd_cnt += pidrec->p_metrics.fd_cnt; + queryMetric->p_metrics.mem.resident += pidrec->p_metrics.mem.resident; + queryMetric->p_metrics.mem.size += pidrec->p_metrics.mem.size; + queryMetric->p_metrics.mem.share += pidrec->p_metrics.mem.share; + TR2(("%s: increase %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", + FLINE, queryMetric->query_key.tmid, queryMetric->query_key.ssid, queryMetric->query_key.ccnt, queryMetric->pid, + queryMetric->cpu_elapsed, queryMetric->p_metrics.cpu_pct, queryMetric->p_metrics.mem.resident)); + } + else + { + // insert existing pid record into table keyed by query key + queryMetric = apr_palloc(oldpool, sizeof(pidrec_t)); + memcpy(queryMetric, pidrec, sizeof(pidrec_t)); + apr_hash_set(query_cpu_table, &queryMetric->query_key, sizeof(gpmon_qlogkey_t), queryMetric); + } + + // add to queryseg hash table, aggregate by tmid-ssid-ccnt、segid + gp_smon_to_mmon_packet_t *rec; + rec = 
apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); + if (rec) + { + rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; + } + else + { + rec = apr_palloc(apr_hash_pool_get(querysegtab), sizeof(gp_smon_to_mmon_packet_t)); + CHECKMEM(rec); + gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); + rec->u.queryseg.key = pidrec->qseg_key; + rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; + apr_hash_set(querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); + } + + //add to new pidtab if process is exist + int status = sigar_proc_state_get(gx.sigar,pidrec->pid, &state); + if (status == SIGAR_OK) + { + apr_pool_t *pool = apr_hash_pool_get(gx.pidtab); + pidrec_t *newpidrec = apr_palloc(pool, sizeof(*pidrec)); + memcpy(newpidrec, pidrec, sizeof(*pidrec)); + apr_hash_set(gx.pidtab, &newpidrec->pidtable_key, sizeof(newpidrec->pidtable_key), newpidrec); + TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); + continue; + } + TR2(("%s: %d-%d-%d pid %d pid status %d not add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, status)); + } + return; +} + static void gx_accept(SOCKET sock, short event, void* arg) { SOCKET nsock; @@ -1014,18 +1027,11 @@ static void gx_recvqexec(gpmon_packet_t* pkt) if (pkt->pkttype != GPMON_PKTTYPE_QEXEC) gpsmon_fatal(FLINE, "assert failed; expected pkttype qexec"); - TR2(("%s received qexec packet %d-%d-%d pid %d\n", FLINE, pkt->u.qlog.key.tmid, pkt->u.qlog.key.ssid, pkt->u.qlog.key.ccnt, pkt->u.qlog.pid)); + TR2(("%s received qexec packet(query id :%d-%d-%d, segid: %d, pid %d)\n", + FLINE, pkt->u.qexec.key.qkey.tmid, pkt->u.qexec.key.qkey.ssid, pkt->u.qexec.key.qkey.ccnt, pkt->u.qexec.key.hash_key.segid, pkt->u.qexec.key.hash_key.pid)); p = &pkt->u.qexec; - get_pid_metrics(p->key.hash_key, - p->key.qkey.tmid, - p->key.qkey.ssid, - 
p->key.qkey.ccnt); - // Store some aggregated information somewhere for metrics in - // queries_* tables, like cpu_elapsed, rows_out, and etc. - //extract_segments_exec(pkt); - // We don't call gpmon_warning here because the number of - // packet is big, and we would make log boating. + get_pid_metrics(p->key.hash_key, p->key.qkey.tmid, p->key.qkey.ssid, p->key.qkey.ccnt); return; } @@ -1543,8 +1549,8 @@ void gx_main(int port, apr_int64_t signature) rec = vptr; if (rec) { - TR2(("%s: %d-%d-%d pid %d refresh process metrics \n ", - FLINE, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt, rec->pid)); + TR2(("%s:----start to refresh pid(%d) query id(%d-%d-%d) metrics in main loop----\n", + FLINE, rec->pid, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt)); get_pid_metrics(rec->hash_key, rec->query_key.tmid, rec->query_key.ssid, diff --git a/contrib/perfmon/src/include/gpmon.h b/contrib/perfmon/src/include/gpmon.h index 3fd1b364a76..aa2c2ce425f 100644 --- a/contrib/perfmon/src/include/gpmon.h +++ b/contrib/perfmon/src/include/gpmon.h @@ -176,6 +176,11 @@ typedef struct gpmon_qexec_hash_key_t { int16 nid; /* plan node id */ }gpmon_qexec_hash_key_t; +typedef struct gpmon_pidtable_key_t { + int32 pid; /* process id */ + gpmon_qlogkey_t qkey; +}gpmon_pidtable_key_t; + /* XXX According to CK. 
* QE will NOT need to touch anything begin with _ */ diff --git a/contrib/perfmon/src/include/gpmonlib.h b/contrib/perfmon/src/include/gpmonlib.h index 7d35557fac6..3ae9f13be11 100644 --- a/contrib/perfmon/src/include/gpmonlib.h +++ b/contrib/perfmon/src/include/gpmonlib.h @@ -30,7 +30,7 @@ extern int verbose; /* TODO: REMOVE */ //extern int very_verbose; #define TR0(x) gpmon_print x -#define TR1(x) if (verbose == 1) gpmon_print x +#define TR1(x) if (verbose >= 1) gpmon_print x #define TR2(x) if (verbose == 2) gpmon_print x #define TR1_FILE(x) if (verbose == 1) gpmon_print_file x @@ -220,6 +220,7 @@ typedef struct gp_smon_to_mmon_header_t { typedef struct gp_smon_to_mmon_packet_t { gp_smon_to_mmon_header_t header; + char* ipaddr; union { gpmon_hello_t hello; gpmon_metrics_t metrics; From 49740f90ced8a02314218c79a3eecd776681b513 Mon Sep 17 00:00:00 2001 From: huluhuifeng Date: Fri, 17 Jan 2025 11:02:21 +0800 Subject: [PATCH 35/40] [Perfmon] Increase the size of the table to avoid the situation where the collected metrics are 0. --- contrib/perfmon/expected/query.out | 14 +++++++------- contrib/perfmon/sql/query.sql | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index d04e081561d..6c1eed4146d 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -30,7 +30,7 @@ select wait_for_gpmmon_work(); select sess_id from pg_stat_activity where pg_backend_pid()=pid; sess_id --------- - 7316 + 401542 (1 row) \gset @@ -41,15 +41,15 @@ HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sur CREATE TABLE test(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
-INSERT INTO foo SELECT generate_series(0,30000000); -INSERT INTO test SELECT generate_series(0,30000000); +INSERT INTO foo SELECT generate_series(0,40000000); +INSERT INTO test SELECT generate_series(0,40000000); -- test query text in multiple lines INSERT INTO test SELECT generate_series(0,10); select count(*) from foo,test where foo.a=test.a; count ---------- - 30000012 + 40000012 (1 row) -- test nested query @@ -61,7 +61,7 @@ $$ language plpgsql; select * from n_join_foo_test(); n_join_foo_test ----------------- - 30000012 + 40000012 (1 row) DROP TABLE foo; @@ -121,8 +121,8 @@ where ssid = :sess_id order by ccnt; ------+--------+------------------------------------------------------------------+---------- 2 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t 4 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t - 8 | done | INSERT INTO foo SELECT generate_series(0,30000000); | t - 10 | done | INSERT INTO test SELECT generate_series(0,30000000); | t + 8 | done | INSERT INTO foo SELECT generate_series(0,40000000); | t + 10 | done | INSERT INTO test SELECT generate_series(0,40000000); | t 12 | done | INSERT INTO test +| t | | SELECT generate_series(0,10); | 14 | done | select count(*) from foo,test where foo.a=test.a; | t diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index f788f3c76ae..d3410f0260b 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -28,8 +28,8 @@ select sess_id from pg_stat_activity where pg_backend_pid()=pid; CREATE TABLE foo(a int); CREATE TABLE test(a int); -INSERT INTO foo SELECT generate_series(0,30000000); -INSERT INTO test SELECT generate_series(0,30000000); +INSERT INTO foo SELECT generate_series(0,40000000); +INSERT INTO test SELECT generate_series(0,40000000); -- test query text in multiple lines INSERT INTO test SELECT generate_series(0,10); From 2cdf4f038e8437792cedc07aed89d6914ff8209e Mon Sep 17 00:00:00 2001 From: 
huluhuifeng Date: Fri, 14 Feb 2025 19:49:56 +0800 Subject: [PATCH 36/40] Perfmon: ignore unstable test --- contrib/perfmon/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index b64484917e1..c385244df56 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,7 +1,7 @@ NAME = perfmon EXTVERSION = 1.1.0 -REGRESS = pre_run_check guc_config query extension_test pg_qs post_run +REGRESS = pre_run_check guc_config extension_test pg_qs post_run ifdef USE_PGXS PG_CONFIG = pg_config From 2543258c15e4c25849aa5da975c66aa6ccb7eb6d Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Mon, 24 Feb 2025 16:53:55 +0800 Subject: [PATCH 37/40] Fix the proc cpu percent got by gpsmon The proc cpu percent got by gpsmon sometimes is 0 although the proc cpu cost is not 0. The bug is caused by the pidtab hash table's key. The key is not pid, but defined as queryid + pid. In the case a session runs a bunch of queries, there will be multiple entries in the pidtable which have the same pid, and the cpu percent will be computed multiple times for a proc. The cpu percent is computed as (nowcpucost-lastcpucost)/(timenow-lasttime), so nowcpucost-lastcpucost is 0 if it is collected within a minimal timeframe. Fix that by using pid as the hash table's key. By the way, did a small refactor to make the code cleaner. 
--- contrib/perfmon/Makefile | 2 +- contrib/perfmon/expected/query.out | 48 ++--- contrib/perfmon/sql/query.sql | 12 +- contrib/perfmon/src/gpsmon/gpsmon.c | 324 ++++++++++++++-------------- 4 files changed, 194 insertions(+), 192 deletions(-) diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index c385244df56..b64484917e1 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -1,7 +1,7 @@ NAME = perfmon EXTVERSION = 1.1.0 -REGRESS = pre_run_check guc_config extension_test pg_qs post_run +REGRESS = pre_run_check guc_config query extension_test pg_qs post_run ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/perfmon/expected/query.out b/contrib/perfmon/expected/query.out index 6c1eed4146d..459bbfa959d 100644 --- a/contrib/perfmon/expected/query.out +++ b/contrib/perfmon/expected/query.out @@ -34,24 +34,20 @@ select sess_id from pg_stat_activity where pg_backend_pid()=pid; (1 row) \gset --- end_ignore -CREATE TABLE foo(a int); +CREATE TABLE foo(a int, b int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. CREATE TABLE test(a int); NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Cloudberry Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. 
-INSERT INTO foo SELECT generate_series(0,40000000); -INSERT INTO test SELECT generate_series(0,40000000); +\timing +INSERT INTO foo SELECT i + 1 from generate_series(0,80000000) as i; +Time: 149935.380 ms (02:29.935) +\timing +-- end_ignore -- test query text in multiple lines INSERT INTO test SELECT generate_series(0,10); -select count(*) from foo,test where foo.a=test.a; - count ----------- - 40000012 -(1 row) - -- test nested query create or replace function n_join_foo_test() returns integer as $$ begin @@ -61,7 +57,7 @@ $$ language plpgsql; select * from n_join_foo_test(); n_join_foo_test ----------------- - 40000012 + 10 (1 row) DROP TABLE foo; @@ -117,29 +113,27 @@ select count(*) > 0 from diskspace_history; select ccnt, status, query_text, length(query_plan) > 0 from queries_history where ssid = :sess_id order by ccnt; - ccnt | status | query_text | ?column? -------+--------+------------------------------------------------------------------+---------- - 2 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t - 4 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t - 8 | done | INSERT INTO foo SELECT generate_series(0,40000000); | t - 10 | done | INSERT INTO test SELECT generate_series(0,40000000); | t - 12 | done | INSERT INTO test +| t - | | SELECT generate_series(0,10); | - 14 | done | select count(*) from foo,test where foo.a=test.a; | t - 17 | done | select * from n_join_foo_test(); | t -(7 rows) + ccnt | status | query_text | ?column? 
+------+--------+---------------------------------------------------------------------+---------- + 2 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t + 4 | done | select sess_id from pg_stat_activity where pg_backend_pid()=pid; | t + 8 | done | INSERT INTO foo SELECT i + 1 from generate_series(0,80000000) as i; | t + 10 | done | INSERT INTO test +| t + | | SELECT generate_series(0,10); | + 13 | done | select * from n_join_foo_test(); | t +(5 rows) SELECT COUNT(*) FROM (SELECT DISTINCT ccnt FROM queries_history where ssid = :sess_id) as temp; count ------- - 7 + 5 (1 row) select mem_peak>0, cpu_currpct>0, spill_file_size>0, skew_cpu>0, status, query_text, length(query_plan) > 0 from queries_history -where ssid = :sess_id and query_text = 'select count(*) from foo,test where foo.a=test.a;' - ?column? | ?column? | ?column? | ?column? | status | query_text | ?column? -----------+----------+----------+----------+--------+---------------------------------------------------+---------- - t | t | t | t | done | select count(*) from foo,test where foo.a=test.a; | t +where ssid = :sess_id and query_text = 'INSERT INTO foo SELECT i + 1 from generate_series(0,80000000) as i;' + ?column? | ?column? | ?column? | ?column? | status | query_text | ?column? 
+----------+----------+----------+----------+--------+---------------------------------------------------------------------+---------- + t | t | t | t | done | INSERT INTO foo SELECT i + 1 from generate_series(0,80000000) as i; | t (1 row) diff --git a/contrib/perfmon/sql/query.sql b/contrib/perfmon/sql/query.sql index d3410f0260b..a835c42ab28 100644 --- a/contrib/perfmon/sql/query.sql +++ b/contrib/perfmon/sql/query.sql @@ -24,16 +24,16 @@ select wait_for_gpmmon_work(); \c contrib_regression select sess_id from pg_stat_activity where pg_backend_pid()=pid; \gset +CREATE TABLE foo(a int, b int); +CREATE TABLE test(a int); +\timing +INSERT INTO foo SELECT i + 1 from generate_series(0,80000000) as i; +\timing -- end_ignore -CREATE TABLE foo(a int); -CREATE TABLE test(a int); -INSERT INTO foo SELECT generate_series(0,40000000); -INSERT INTO test SELECT generate_series(0,40000000); -- test query text in multiple lines INSERT INTO test SELECT generate_series(0,10); -select count(*) from foo,test where foo.a=test.a; -- test nested query create or replace function n_join_foo_test() returns integer as $$ begin @@ -67,4 +67,4 @@ SELECT COUNT(*) FROM (SELECT DISTINCT ccnt FROM queries_history where ssid = :sess_id) as temp; select mem_peak>0, cpu_currpct>0, spill_file_size>0, skew_cpu>0, status, query_text, length(query_plan) > 0 from queries_history -where ssid = :sess_id and query_text = 'select count(*) from foo,test where foo.a=test.a;' +where ssid = :sess_id and query_text = 'INSERT INTO foo SELECT i + 1 from generate_series(0,80000000) as i;' diff --git a/contrib/perfmon/src/gpsmon/gpsmon.c b/contrib/perfmon/src/gpsmon/gpsmon.c index 2abb4849af3..6da16dea631 100644 --- a/contrib/perfmon/src/gpsmon/gpsmon.c +++ b/contrib/perfmon/src/gpsmon/gpsmon.c @@ -24,8 +24,6 @@ void update_log_filename(void); void gx_main(int, apr_int64_t); -static void agg_metrics(apr_hash_t* pidtab, apr_hash_t* query_cpu_table, apr_hash_t* querysegtab, apr_pool_t* oldpool); - /* Temporary global 
memory to store the qexec line for a send*/ char qexec_smon_temp_line[QEXEC_MAX_ROW_BUF_SIZE]; @@ -63,9 +61,8 @@ struct pidrec_t gpmon_proc_metrics_t p_metrics; apr_uint64_t cpu_elapsed; gpmon_qlogkey_t query_key; - gpmon_query_seginfo_key_t qseg_key; - gpmon_qexec_hash_key_t hash_key; - gpmon_pidtable_key_t pidtable_key; + gpmon_query_seginfo_key_t qseg_key; + gpmon_qexec_hash_key_t hash_key; }; typedef struct gx_t gx_t; @@ -165,7 +162,10 @@ void update_log_filename() static void gx_accept(SOCKET sock, short event, void* arg); static void gx_recvfrom(SOCKET sock, short event, void* arg); -//static apr_uint32_t create_qexec_packet(const gpmon_qexec_t* qexec, gp_smon_to_mmon_packet_t* pkt); +static void agg_one_query_metrics(pidrec_t *totalMetric, pidrec_t *pidrec); +static void agg_metrics(apr_hash_t *pidtab, apr_hash_t *query_perf_metrics_table, + apr_hash_t *querysegtab, apr_pool_t *oldpool); +static void refresh_proc_metrics(pidrec_t *rec); /** * helper function to copy the union packet from a gpmon_packet_t to a gp_smon_to_mmon_packet_t @@ -254,91 +254,51 @@ static void send_smon_to_mon_pkt(SOCKET sock, gp_smon_to_mmon_packet_t* pkt) TR2(("Sent packet of type %d to mmon\n", pkt->header.pkttype)); } -static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt) +static void refresh_proc_metrics(pidrec_t* rec) { + apr_int32_t tmid; + apr_int32_t ssid; + apr_int32_t ccnt; apr_int32_t status; sigar_proc_cpu_t cpu; sigar_proc_mem_t mem; sigar_mem_t system_mem; sigar_proc_fd_t fd; - pidrec_t* rec; - apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); - gpmon_pidtable_key_t pidtable_key; - pidtable_key.pid = key.pid; - pidtable_key.qkey.tmid = tmid; - pidtable_key.qkey.ssid = ssid; - pidtable_key.qkey.ccnt = ccnt; - rec = apr_hash_get(gx.pidtab, &pidtable_key, sizeof(pidtable_key)); - if (rec && rec->updated_tick == gx.tick) - return; /* updated in current cycle */ + if (rec == NULL) + return; + tmid = rec->query_key.tmid; 
+ ssid = rec->query_key.ssid; + ccnt = rec->query_key.ccnt; memset(&cpu, 0, sizeof(cpu)); memset(&mem, 0, sizeof(mem)); memset(&fd, 0, sizeof(fd)); - if (!rec) - { - sigar_proc_exe_t exe; - - /* There might be cases where the pid no longer exist, so we'll just - * zero out the memory first before doing anything */ - rec = apr_pcalloc(pool, sizeof(*rec)); - CHECKMEM(rec); - - gpmon_query_seginfo_key_t qseg_key; - qseg_key.qkey.tmid = tmid; - qseg_key.qkey.ssid = ssid; - qseg_key.qkey.ccnt = ccnt; - qseg_key.segid = key.segid; - - - rec->pid = key.pid; - rec->query_key.tmid = tmid; - rec->query_key.ssid = ssid; - rec->query_key.ccnt = ccnt; - rec->qseg_key = qseg_key; - rec->hash_key = key; - - rec->pidtable_key = pidtable_key; - - rec->pname = rec->cwd = 0; - if (0 == sigar_proc_exe_get(gx.sigar, key.pid, &exe)) - { - rec->pname = apr_pstrdup(pool, exe.name); - rec->cwd = apr_pstrdup(pool, exe.root); - } - if (!rec->pname) - rec->pname = "unknown"; - if (!rec->cwd) - rec->cwd = "unknown"; - - apr_hash_set(gx.pidtab, &rec->pidtable_key, sizeof(rec->pidtable_key), rec); - } - status = sigar_proc_mem_get(gx.sigar, key.pid, &mem); + status = sigar_proc_mem_get(gx.sigar, rec->pid, &mem); /* ESRCH is error 3: (No such process) */ if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, rec->pid)); } return; } - status = sigar_proc_cpu_get(gx.sigar, key.pid, &cpu); + status = sigar_proc_cpu_get(gx.sigar, rec->pid, &cpu); if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); + TR2(("[WARNING] %s. 
query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, rec->pid)); } return; } - status = sigar_proc_fd_get(gx.sigar, key.pid, &fd); + status = sigar_proc_fd_get(gx.sigar, rec->pid, &fd); if (status != SIGAR_OK) { if (status != ESRCH) { - TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, rec->pid)); } return; } @@ -348,7 +308,7 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in { if (status != ESRCH) { - TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, key.pid)); + TR2(("[WARNING] %s. query id(%d-%d-%d) PID: %d\n", sigar_strerror(gx.sigar, status), tmid, ssid, ccnt, rec->pid)); } return; } @@ -366,7 +326,54 @@ static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_in #endif rec->cpu_elapsed = cpu.total; - TR2(("%s:---pid(%d) query id(%d-%d-%d) metrics refresh finished---\n", FLINE, key.pid, tmid, ssid, ccnt)); + TR2(("%s:---pid(%d) query id(%d-%d-%d) metrics refresh finished---\n", FLINE, rec->pid, tmid, ssid, ccnt)); +} + +static void get_pid_metrics(gpmon_qexec_hash_key_t key, apr_int32_t tmid, apr_int32_t ssid, apr_int32_t ccnt) +{ + pidrec_t* rec; + apr_pool_t* pool = apr_hash_pool_get(gx.pidtab); + + rec = apr_hash_get(gx.pidtab, &key.pid, sizeof(key.pid)); + if (rec && rec->updated_tick == gx.tick) + return; /* updated in current cycle */ + + if (!rec) + { + sigar_proc_exe_t exe; + + /* There might be cases where the pid no longer exist, so we'll just + * zero out the memory first before doing anything */ + rec = apr_pcalloc(pool, sizeof(*rec)); + CHECKMEM(rec); + + gpmon_query_seginfo_key_t qseg_key; + qseg_key.qkey.tmid = tmid; + qseg_key.qkey.ssid = ssid; + qseg_key.qkey.ccnt = ccnt; + qseg_key.segid = key.segid; + + rec->pid = key.pid; + 
rec->query_key.tmid = tmid; + rec->query_key.ssid = ssid; + rec->query_key.ccnt = ccnt; + rec->qseg_key = qseg_key; + rec->hash_key = key; + + rec->pname = rec->cwd = 0; + if (0 == sigar_proc_exe_get(gx.sigar, key.pid, &exe)) + { + rec->pname = apr_pstrdup(pool, exe.name); + rec->cwd = apr_pstrdup(pool, exe.root); + } + if (!rec->pname) + rec->pname = "unknown"; + if (!rec->cwd) + rec->cwd = "unknown"; + + apr_hash_set(gx.pidtab, &rec->pid, sizeof(rec->pid), rec); + } + refresh_proc_metrics(rec); } @@ -670,7 +677,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) apr_pool_t* oldpool; apr_hash_t* qdtab; apr_hash_t* pidtab; - apr_hash_t* querysegtab; + apr_hash_t *querysegtab; if (event & EV_TIMEOUT) // didn't get command from gpmmon, quit { if(gx.tcp_sock) @@ -732,7 +739,7 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) gp_smon_to_mmon_packet_t localPacketObject; pidrec_t* pidrec; int count = 0; - apr_hash_t* query_cpu_table = NULL; + apr_hash_t* query_perf_metrics_table = NULL; for (hi = apr_hash_first(0, qdtab); hi; hi = apr_hash_next(hi)) { @@ -749,11 +756,11 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) } // calculate CPU utilization And Memory utilization per query for this machine - query_cpu_table = apr_hash_make(oldpool); - CHECKMEM(query_cpu_table); + query_perf_metrics_table = apr_hash_make(oldpool); + CHECKMEM(query_perf_metrics_table); - // Iterate the PID's hash table 'pidtab' to update query_cpu hash table and query_seg hash table - agg_metrics(pidtab, query_cpu_table, querysegtab, oldpool); + // Iterate the PID's hash table 'pidtab' to update query_cpu hash table and query_seg hash table + agg_metrics(pidtab, query_perf_metrics_table, querysegtab, oldpool); /* * QUERYSEG packets must be sent after QLOG packets so that gpmmon can @@ -782,8 +789,8 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) strncpy(ppkt->u.qlog.user, gx.hostname, sizeof(ppkt->u.qlog.user) - 1); 
ppkt->u.qlog.user[sizeof(ppkt->u.qlog.user) - 1] = 0; - // loop through the query per cpu table and send the metrics - for (hi = apr_hash_first(0, query_cpu_table); hi; hi = apr_hash_next(hi)) + // loop through the query_perf_metrics_table and send the metrics + for (hi = apr_hash_first(0, query_perf_metrics_table); hi; hi = apr_hash_next(hi)) { void* vptr; apr_hash_this(hi, 0, 0, &vptr); @@ -811,98 +818,102 @@ static void gx_gettcpcmd(SOCKET sock, short event, void* arg) /* get rid of the old pool */ { apr_pool_destroy(oldpool); - qdtab = NULL; - pidtab = NULL; - querysegtab = NULL; + qdtab = NULL; + pidtab = NULL; + querysegtab = NULL; } struct timeval tv; tv.tv_sec = opt.terminate_timeout; tv.tv_usec = 0; - if (event_add(&gx.tcp_event, opt.terminate_timeout ? &tv : NULL)) //reset timeout - { + if (event_add(&gx.tcp_event, opt.terminate_timeout ? &tv : NULL)) // reset timeout + { gpmon_warningx(FLINE, APR_FROM_OS_ERROR(errno), "event_add failed"); - } + } return; } -static void agg_metrics(apr_hash_t* pidtab, apr_hash_t* query_cpu_table, apr_hash_t* querysegtab, apr_pool_t* oldpool){ - sigar_proc_state_t state; - apr_hash_index_t* hi; - for (hi = apr_hash_first(0, pidtab); hi; hi = apr_hash_next(hi)) - { - void* vptr; - pidrec_t* queryMetric; - pidrec_t *pidrec; - - apr_hash_this(hi, 0, 0, &vptr); - pidrec = vptr; - if (!pidrec) - { - continue; - } +static void agg_one_query_metrics(pidrec_t *totalMetric, pidrec_t *pidrec) +{ + totalMetric->cpu_elapsed += pidrec->cpu_elapsed; + totalMetric->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; + totalMetric->p_metrics.fd_cnt += pidrec->p_metrics.fd_cnt; + totalMetric->p_metrics.mem.resident += pidrec->p_metrics.mem.resident; + totalMetric->p_metrics.mem.size += pidrec->p_metrics.mem.size; + totalMetric->p_metrics.mem.share += pidrec->p_metrics.mem.share; + TR2(("%s: increase %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", + FLINE, totalMetric->query_key.tmid, totalMetric->query_key.ssid, 
totalMetric->query_key.ccnt, totalMetric->pid, + totalMetric->cpu_elapsed, totalMetric->p_metrics.cpu_pct, totalMetric->p_metrics.mem.resident)); +} - TR2(("%s: %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, - pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct, pidrec->p_metrics.mem.resident)); +static void agg_metrics(apr_hash_t *pidtab, apr_hash_t *query_perf_metrics_table, apr_hash_t *querysegtab, apr_pool_t *oldpool) +{ + sigar_proc_state_t state; + apr_hash_index_t *hi; + for (hi = apr_hash_first(0, pidtab); hi; hi = apr_hash_next(hi)) + { + void *vptr; + pidrec_t *queryMetric; + pidrec_t *pidrec; - // add to query_cpu_table,aggregate by tmid-ssid-ccnt - queryMetric = apr_hash_get(query_cpu_table, &pidrec->query_key, sizeof(pidrec->query_key)); + apr_hash_this(hi, 0, 0, &vptr); + pidrec = vptr; + if (!pidrec) + { + continue; + } + // add to new pidtab if process is exist + int status = sigar_proc_state_get(gx.sigar, pidrec->pid, &state); + if (status != SIGAR_OK) + { + continue; + } + else + { + apr_pool_t *pool = apr_hash_pool_get(gx.pidtab); + pidrec_t *newpidrec = apr_palloc(pool, sizeof(*pidrec)); + memcpy(newpidrec, pidrec, sizeof(*pidrec)); + apr_hash_set(gx.pidtab, &newpidrec->pid, sizeof(newpidrec->pid), newpidrec); + TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); + } - if (queryMetric) - { - // found other pids with same query key so add the metrics to that - - queryMetric->cpu_elapsed += pidrec->cpu_elapsed; - queryMetric->p_metrics.cpu_pct += pidrec->p_metrics.cpu_pct; - queryMetric->p_metrics.fd_cnt += pidrec->p_metrics.fd_cnt; - queryMetric->p_metrics.mem.resident += pidrec->p_metrics.mem.resident; - queryMetric->p_metrics.mem.size += pidrec->p_metrics.mem.size; - queryMetric->p_metrics.mem.share += pidrec->p_metrics.mem.share; - TR2(("%s: 
increase %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f Mem size %lu)\n", - FLINE, queryMetric->query_key.tmid, queryMetric->query_key.ssid, queryMetric->query_key.ccnt, queryMetric->pid, - queryMetric->cpu_elapsed, queryMetric->p_metrics.cpu_pct, queryMetric->p_metrics.mem.resident)); - } - else - { - // insert existing pid record into table keyed by query key - queryMetric = apr_palloc(oldpool, sizeof(pidrec_t)); - memcpy(queryMetric, pidrec, sizeof(pidrec_t)); - apr_hash_set(query_cpu_table, &queryMetric->query_key, sizeof(gpmon_qlogkey_t), queryMetric); - } - - // add to queryseg hash table, aggregate by tmid-ssid-ccnt、segid - gp_smon_to_mmon_packet_t *rec; - rec = apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); - if (rec) - { - rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; - } - else - { - rec = apr_palloc(apr_hash_pool_get(querysegtab), sizeof(gp_smon_to_mmon_packet_t)); - CHECKMEM(rec); - gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); - rec->u.queryseg.key = pidrec->qseg_key; - rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; - apr_hash_set(querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); - } - - //add to new pidtab if process is exist - int status = sigar_proc_state_get(gx.sigar,pidrec->pid, &state); - if (status == SIGAR_OK) - { - apr_pool_t *pool = apr_hash_pool_get(gx.pidtab); - pidrec_t *newpidrec = apr_palloc(pool, sizeof(*pidrec)); - memcpy(newpidrec, pidrec, sizeof(*pidrec)); - apr_hash_set(gx.pidtab, &newpidrec->pidtable_key, sizeof(newpidrec->pidtable_key), newpidrec); - TR2(("%s: %d-%d-%d pid %d add to new pidtab \n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid)); - continue; - } - TR2(("%s: %d-%d-%d pid %d pid status %d not add to new pidtab \n", - FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, status)); - } - return; + TR2(("%s: %d-%d-%d pid %d (CPU elapsed %ld CPU Percent %.2f 
Mem size %lu)\n", + FLINE, pidrec->query_key.tmid, pidrec->query_key.ssid, pidrec->query_key.ccnt, pidrec->pid, + pidrec->cpu_elapsed, pidrec->p_metrics.cpu_pct, pidrec->p_metrics.mem.resident)); + + // add to query_perf_metrics_table,aggregate by tmid-ssid-ccnt + queryMetric = apr_hash_get(query_perf_metrics_table, &pidrec->query_key, sizeof(pidrec->query_key)); + + if (queryMetric) + { + agg_one_query_metrics(queryMetric, pidrec); + } + else + { + // insert existing pid record into query_perf_metrics_table + queryMetric = apr_palloc(oldpool, sizeof(pidrec_t)); + memcpy(queryMetric, pidrec, sizeof(pidrec_t)); + apr_hash_set(query_perf_metrics_table, &queryMetric->query_key, sizeof(gpmon_qlogkey_t), queryMetric); + } + + // add to queryseg hash table, aggregate by tmid-ssid-ccnt、segid + gp_smon_to_mmon_packet_t *rec; + rec = apr_hash_get(querysegtab, &pidrec->qseg_key, sizeof(pidrec->qseg_key)); + if (rec) + { + rec->u.queryseg.sum_cpu_elapsed += pidrec->cpu_elapsed; + } + else + { + rec = apr_palloc(apr_hash_pool_get(querysegtab), sizeof(gp_smon_to_mmon_packet_t)); + CHECKMEM(rec); + gp_smon_to_mmon_set_header(rec, GPMON_PKTTYPE_QUERYSEG); + rec->u.queryseg.key = pidrec->qseg_key; + rec->u.queryseg.sum_cpu_elapsed = pidrec->cpu_elapsed; + apr_hash_set(querysegtab, &rec->u.queryseg.key, sizeof(rec->u.queryseg.key), rec); + } + } + return; } static void gx_accept(SOCKET sock, short event, void* arg) @@ -1549,12 +1560,9 @@ void gx_main(int port, apr_int64_t signature) rec = vptr; if (rec) { - TR2(("%s:----start to refresh pid(%d) query id(%d-%d-%d) metrics in main loop----\n", - FLINE, rec->pid, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt)); - get_pid_metrics(rec->hash_key, - rec->query_key.tmid, - rec->query_key.ssid, - rec->query_key.ccnt); + TR2(("%s:----start to refresh pid(%d) query id(%d-%d-%d) metrics in main loop----\n", + FLINE, rec->pid, rec->query_key.tmid, rec->query_key.ssid, rec->query_key.ccnt)); + refresh_proc_metrics(rec); } } From 
b28aa2dcd8d765466056f3ae4ce696556f0cee67 Mon Sep 17 00:00:00 2001 From: zhuzhiyong Date: Thu, 6 Mar 2025 10:06:34 +0800 Subject: [PATCH 38/40] Avoid paths like // in build commands Fix the issue: /usr/lib/rpm/debugedit: canonicalization unexpectedly shrank by one character --- contrib/perfmon/Makefile | 2 +- contrib/perfmon/src/gpmmon/Makefile | 2 +- contrib/perfmon/src/gpmon/Makefile | 2 +- contrib/perfmon/src/gpsmon/Makefile | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/perfmon/Makefile b/contrib/perfmon/Makefile index b64484917e1..715d9d83ce3 100644 --- a/contrib/perfmon/Makefile +++ b/contrib/perfmon/Makefile @@ -8,7 +8,7 @@ PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else -top_builddir = ../../ +top_builddir = ../.. subdir = contrib/perfmon include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk diff --git a/contrib/perfmon/src/gpmmon/Makefile b/contrib/perfmon/src/gpmmon/Makefile index 30e072435b0..ec0848e1d27 100644 --- a/contrib/perfmon/src/gpmmon/Makefile +++ b/contrib/perfmon/src/gpmmon/Makefile @@ -1,4 +1,4 @@ -top_builddir = ../../../../ +top_builddir = ../../../.. MODULE_big = gpmmon OBJS = gpmmon.o gpmondb.o gpmon_agg.o ../common/gpmonlib.o diff --git a/contrib/perfmon/src/gpmon/Makefile b/contrib/perfmon/src/gpmon/Makefile index f41f18fd9f6..9f3d8559bb6 100644 --- a/contrib/perfmon/src/gpmon/Makefile +++ b/contrib/perfmon/src/gpmon/Makefile @@ -1,4 +1,4 @@ -top_builddir = ../../../../ +top_builddir = ../../../.. MODULE_big = gpmon OBJS = gpmon.o pg_query_state.o signal_handler.o diff --git a/contrib/perfmon/src/gpsmon/Makefile b/contrib/perfmon/src/gpsmon/Makefile index 8484e3d3eb0..1dd9903ba20 100644 --- a/contrib/perfmon/src/gpsmon/Makefile +++ b/contrib/perfmon/src/gpsmon/Makefile @@ -1,4 +1,4 @@ -top_builddir = ../../../../ +top_builddir = ../../../.. PG_CPPFLAGS = -I$(libpq_srcdir) -I../include -I. 
-Wno-error=vla -Wno-vla ifdef USE_PGXS From dcb2df58ecfe21e59c0b8a1c4497541605ba7756 Mon Sep 17 00:00:00 2001 From: Zhanwei Wang Date: Tue, 1 Apr 2025 22:33:13 +0800 Subject: [PATCH 39/40] Fix include issue for gpmmon and gpsmon --- contrib/perfmon/src/gpmmon/Makefile | 2 +- contrib/perfmon/src/gpsmon/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/perfmon/src/gpmmon/Makefile b/contrib/perfmon/src/gpmmon/Makefile index ec0848e1d27..afc5e8a6cbf 100644 --- a/contrib/perfmon/src/gpmmon/Makefile +++ b/contrib/perfmon/src/gpmmon/Makefile @@ -4,7 +4,7 @@ MODULE_big = gpmmon OBJS = gpmmon.o gpmondb.o gpmon_agg.o ../common/gpmonlib.o SHLIB_LINK += -levent -lapr-1 -laprutil-1 -lm PG_CFLAGS += -Wno-error=vla -Wno-vla -PG_CPPFLAGS = -I$(libpq_srcdir) -I../include +PG_CPPFLAGS = -I$(libpq_srcdir) -I../include $(apr_includes) SHLIB_LINK_INTERNAL = -Wl,-Bsymbolic -Wl,-Bstatic -Wl,-Bstatic $(libpq) -lpgcommon_shlib -Wl,-Bdynamic ifdef USE_PGXS diff --git a/contrib/perfmon/src/gpsmon/Makefile b/contrib/perfmon/src/gpsmon/Makefile index 1dd9903ba20..89f49d71ea3 100644 --- a/contrib/perfmon/src/gpsmon/Makefile +++ b/contrib/perfmon/src/gpsmon/Makefile @@ -1,6 +1,6 @@ top_builddir = ../../../.. -PG_CPPFLAGS = -I$(libpq_srcdir) -I../include -I. -Wno-error=vla -Wno-vla +PG_CPPFLAGS = -I$(libpq_srcdir) -I../include -I. 
$(apr_includes) -Wno-error=vla -Wno-vla -Wno-unused-result ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) From 94b8cf3ecfa6dfe305864e4a2b74aa749fcc57da Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Mon, 12 May 2025 14:38:27 +0800 Subject: [PATCH 40/40] Remove unused files --- contrib/perfmon/README_hashdata.md | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 contrib/perfmon/README_hashdata.md diff --git a/contrib/perfmon/README_hashdata.md b/contrib/perfmon/README_hashdata.md deleted file mode 100644 index 01b44fa43ae..00000000000 --- a/contrib/perfmon/README_hashdata.md +++ /dev/null @@ -1,9 +0,0 @@ -1. gp_elog and guc:'gpperfmon_log_alert_level' have been -removed in hashdata-lightning - - disable check_disk_space - - disable message_main - - disable gpdb_import_alert_log -2. load gpmon as a shared library - - disable parse_command_line. - - get opt.port and opt.conf_file by xx - - modify the Makefile and gpperfmon_install