Fix for RM 41527, Issue EnterpriseDB#41
Problem Statement:

Hive and Spark both support HiveQL and are compatible except for the behaviour
of the ANALYZE command. The difference is as follows: in Hive, ANALYZE is a
utility command and does not return a result set, whereas in Spark it returns
a (zero-row) result set.

For example, in Hive we get this output:
--------------------------
0: jdbc:hive2://localhost:10000/testdb> analyze table names_tab compute statistics;
INFO  : Number of reduce tasks is set to 0 since there's no reduce operator
INFO  : number of splits:1
INFO  : Submitting tokens for job: job_1488090103001_0007
INFO  : The url to track the job: http://localhost:8088/proxy/application_1488090103001_0007/
INFO  : Starting Job = job_1488090103001_0007, Tracking URL = http://localhost:8088/proxy/application_1488090103001_0007/
INFO  : Kill Command = /home/abbasbutt/Projects/hadoop_fdw/hadoop/bin/hadoop job -kill job_1488090103001_0007
INFO  : Hadoop job information for Stage-0: number of mappers: 1; number of reducers: 0
INFO  : 2017-08-22 19:08:11,328 Stage-0 map = 0%, reduce = 0%
No rows affected (11.949 seconds)
INFO  : 2017-08-22 19:08:15,465 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 0.93 sec
INFO  : MapReduce Total cumulative CPU time: 930 msec
INFO  : Ended Job = job_1488090103001_0007
INFO  : Table testdb.names_tab stats: [numFiles=2, numRows=12, totalSize=76, rawDataSize=64]
0: jdbc:hive2://localhost:10000/testdb>
[abbasbutt@localhost bin]$

In Spark we get this output:
---------------------------
0: jdbc:hive2://localhost:10000/my_spark_db> analyze table junk_table compute statistics;
+---------+--+
| Result  |
+---------+--+
+---------+--+
No rows selected (1.462 seconds)

Solution:

The CREATE SERVER command already has a client_type option, which so far
supported the single value 'hiveserver2'. To support ANALYZE against Spark,
the option now also accepts the value 'spark'. If client_type is not
specified, it defaults to 'hiveserver2' and the ANALYZE command will fail
when the remote server is actually Spark; with the correct client_type
specified, ANALYZE works against Spark as well.

For example:

postgres=# CREATE EXTENSION hdfs_fdw;
CREATE EXTENSION
postgres=# CREATE SERVER hdfs_svr FOREIGN DATA WRAPPER hdfs_fdw OPTIONS (host '127.0.0.1', port '10000', client_type 'spark');
CREATE SERVER
postgres=# CREATE USER MAPPING FOR abbasbutt SERVER hdfs_svr OPTIONS (username 'ldapadm', password 'ldapadm');
CREATE USER MAPPING
postgres=# CREATE FOREIGN TABLE fnt(a int, name varchar(255)) SERVER hdfs_svr OPTIONS (dbname 'my_spark_db', table_name 'junk_table');
CREATE FOREIGN TABLE
postgres=# ANALYZE fnt;
ANALYZE
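
The gist of the fix on the FDW side is that ANALYZE must fetch and discard
the result set Spark returns, while continuing to expect none from
HiveServer2. Below is a minimal C sketch of that dispatch, keyed off the
server-level option; every identifier in it (HdfsClientType,
hdfs_query_execute, hdfs_fetch_row, hdfs_close_result_set,
run_remote_analyze) is a hypothetical placeholder for illustration, not the
actual hdfs_fdw API:

/*
 * Illustrative sketch only; all identifiers below are assumptions,
 * not the real hdfs_fdw source.
 */
typedef enum HdfsClientType
{
	CLIENT_HIVESERVER2,			/* client_type 'hiveserver2' (default) */
	CLIENT_SPARK				/* client_type 'spark' */
} HdfsClientType;

/* Assumed helpers wrapping the remote HiveQL client. */
extern void hdfs_query_execute(int conn, const char *sql);
extern int	hdfs_fetch_row(int conn);	/* returns 0 when no more rows */
extern void hdfs_close_result_set(int conn);

static void
run_remote_analyze(int conn, HdfsClientType client_type, const char *sql)
{
	hdfs_query_execute(conn, sql);

	/*
	 * Spark returns a result set (with zero rows) for ANALYZE TABLE ...
	 * COMPUTE STATISTICS, so it must be fetched and discarded before the
	 * statement is complete. HiveServer2 returns no result set for this
	 * utility command, and trying to fetch one would fail.
	 */
	if (client_type == CLIENT_SPARK)
	{
		while (hdfs_fetch_row(conn))
			;					/* discard any rows */
		hdfs_close_result_set(conn);
	}
}

Keying the behaviour off the server-level client_type option keeps foreign
table definitions unchanged; only the CREATE SERVER statement has to
identify which engine is on the other end.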