1. Nag Arvind Gudiseva
1
HIVE PERFORMANCE OPTIMIZATIONS
SERDE
ORIGINAL
-- Baseline DDL: plain-text table with explicit field and line terminators.
-- The terminators must be written as escape sequences; a bare 't' or 'n'
-- would split records on the literal letters instead of TAB / newline.
CREATE TABLE IF NOT EXISTS cand_sr.cand_sr_note_nda_detail (
    RUN_ID            BIGINT,
    GUID              STRING,
    CIP_COLLECTION_ID BIGINT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
OPTIMISED
-- Same three-column table, declared by naming the SerDe class directly
-- instead of using ROW FORMAT DELIMITED. No SERDEPROPERTIES are supplied
-- here, so no delimiters are set explicitly by this statement.
CREATE TABLE IF NOT EXISTS cand_sr.cand_sr_note_nda_detail (
    RUN_ID            BIGINT,
    GUID              STRING,
    CIP_COLLECTION_ID BIGINT
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe';
OCTAL
ORIGINAL
-- Baseline DDL for the delimiter example.
-- '<FSP>' and '<RSP>' look like placeholders for the field and record
-- separators -- TODO confirm the real values before running this.
-- A table definition takes a single ROW FORMAT clause, so the trailing
-- "ROW FORMAT SERDE ...LazySimpleSerDe" line was dropped: it conflicted with
-- the ROW FORMAT DELIMITED clause already present.
CREATE TABLE IF NOT EXISTS cand_sr.cand_sr_note_nda_detail (
    RUN_ID            BIGINT,
    GUID              STRING,
    CIP_COLLECTION_ID BIGINT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '<FSP>'
    LINES TERMINATED BY '<RSP>';
2. Nag Arvind Gudiseva
2
OPTIMISED
-- Text table with complex column types, using octal-escaped control
-- characters as separators at each nesting level:
--   \001 between fields, \002 between collection items, \003 between a map
--   key and its value, and newline between rows.
-- The backslashes are required; without them '001' etc. are three-character
-- literal strings rather than control characters.
CREATE TABLE employees (
    name         STRING,
    salary       FLOAT,
    subordinates ARRAY<STRING>,
    deductions   MAP<STRING, FLOAT>,
    address      STRUCT<street: STRING, city: STRING, state: STRING, zip: INT>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
    COLLECTION ITEMS TERMINATED BY '\002'
    MAP KEYS TERMINATED BY '\003'
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
INTERMEDIATE TABLES, BUCKETING AND COMPRESSION
ORIGINAL
-- [page 3 -- Nag Arvind Gudiseva] (page-break marker lifted out of the statement body)
-- Baseline: fill NDA_CNR_DEVICE_JOIN_MAP_FH in a single pass by joining the
-- two source tables on FAMILY / PLATFORM / ROLE / CIP_COLLECTION_ID.
-- The PARTITION clause names both partition columns without values, so the
-- last two SELECT columns (CIP_COLLECTION_ID, ELEMENT_TYPE) supply them.
-- The statement is now terminated with ';' so it does not run into whatever
-- follows it in a script.
INSERT OVERWRITE TABLE NDA_CNR_DEVICE_JOIN_MAP_FH
PARTITION (CIP_COLLECTION_ID, ELEMENT_TYPE)
SELECT
    T1.CIP_DEVICE_ID AS D2,
    T2.CIP_DEVICE_ID AS D1,
    T2.DV_CNT_BY_GRP,
    T2.FH_LVL_GRP,
    T2.FAMILY,
    T2.PLATFORM,
    T2.ROLE,
    T2.LEVEL_1,
    T2.LEVEL_2,
    T2.LEVEL_3,
    T2.IS_POLICY,
    T2.IS_GRP_POLICY,
    T2.ELEMENT,
    T1.CIP_COLLECTION_ID,
    T2.ELEMENT_TYPE
FROM NDA_POLICY_DV_DETAILS_FH T1
JOIN NDA_CNR_DEVICE_ELEMENT_MAP_FH T2
    ON (T1.FAMILY = T2.FAMILY
        AND T1.PLATFORM = T2.PLATFORM
        AND T1.ROLE = T2.ROLE
        AND T1.CIP_COLLECTION_ID = T2.CIP_COLLECTION_ID)
WHERE (T1.CIP_COLLECTION_ID = 255 AND T2.ELEMENT_TYPE = 'FTS');
OPTIMISED
-- Session settings used ahead of the bucketed / sorted table loads below.
-- Each command is terminated with ';' so the two SETs stay separate and do
-- not absorb the CREATE TABLE that follows.
SET hive.enforce.bucketing=true;
SET hive.enforce.sorting=true;
-- Intermediate table for the policy-device side of the join.
-- Rows are bucketed and sorted on the join keys (FAMILY, PLATFORM, ROLE),
-- spread over 50 buckets, and stored as Snappy-compressed ORC.
CREATE TABLE IF NOT EXISTS NDA_POLICY_DV_DETAILS_FH_TEMP
(
    FAMILY            STRING,
    PLATFORM          STRING,
    ROLE              STRING,
    D2                BIGINT,
    CIP_COLLECTION_ID BIGINT
)
CLUSTERED BY (FAMILY, PLATFORM, ROLE)
    SORTED BY (FAMILY, PLATFORM, ROLE)
    INTO 50 BUCKETS
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- [page 4 -- Nag Arvind Gudiseva] (page-break marker lifted out of the statement body)
-- Load the policy-device intermediate table for one collection.
-- SELECT order matches the target's column order:
--   FAMILY, PLATFORM, ROLE, D2, CIP_COLLECTION_ID.
-- The collection id literal uses plain ASCII quotes; the typographic quotes
-- that were here are not valid string delimiters.
-- NOTE(review): the literal is kept as a string to match the sibling load of
-- NDA_CNR_DEVICE_ELEMENT_MAP_FH_TEMP -- confirm the source column's type.
INSERT OVERWRITE TABLE NDA_POLICY_DV_DETAILS_FH_TEMP
SELECT
    T1.FAMILY,
    T1.PLATFORM,
    T1.ROLE,
    T1.CIP_DEVICE_ID AS D2,
    T1.CIP_COLLECTION_ID
FROM NDA_POLICY_DV_DETAILS_FH T1
WHERE (T1.CIP_COLLECTION_ID = '3159')
CLUSTER BY T1.FAMILY, T1.PLATFORM, T1.ROLE;
-- Intermediate table for the device-element side of the join.
-- Uses the same layout as the policy-device intermediate table: bucketed and
-- sorted on (FAMILY, PLATFORM, ROLE), 50 buckets, Snappy-compressed ORC.
CREATE TABLE IF NOT EXISTS NDA_CNR_DEVICE_ELEMENT_MAP_FH_TEMP
(
    FAMILY        STRING,
    PLATFORM      STRING,
    ROLE          STRING,
    D1            BIGINT,
    DV_CNT_BY_GRP BIGINT,
    FH_LVL_GRP    STRING,
    LEVEL_1       STRING,
    LEVEL_2       STRING,
    LEVEL_3       STRING,
    IS_POLICY     STRING,
    IS_GRP_POLICY STRING,
    ELEMENT       STRING,
    ELEMENT_TYPE  STRING
)
CLUSTERED BY (FAMILY, PLATFORM, ROLE)
    SORTED BY (FAMILY, PLATFORM, ROLE)
    INTO 50 BUCKETS
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- [page 5 -- Nag Arvind Gudiseva] (page-break marker lifted out of the statement body)
-- Load the device-element intermediate table for one collection and one
-- element type ('FTS'). SELECT order matches the target's column order,
-- from FAMILY through ELEMENT_TYPE; CIP_DEVICE_ID is stored as D1.
INSERT OVERWRITE TABLE NDA_CNR_DEVICE_ELEMENT_MAP_FH_TEMP
SELECT
    T2.FAMILY,
    T2.PLATFORM,
    T2.ROLE,
    T2.CIP_DEVICE_ID AS D1,
    T2.DV_CNT_BY_GRP,
    T2.FH_LVL_GRP,
    T2.LEVEL_1,
    T2.LEVEL_2,
    T2.LEVEL_3,
    T2.IS_POLICY,
    T2.IS_GRP_POLICY,
    T2.ELEMENT,
    T2.ELEMENT_TYPE
FROM NDA_CNR_DEVICE_ELEMENT_MAP_FH T2
WHERE (T2.CIP_COLLECTION_ID = '3159' AND T2.ELEMENT_TYPE = 'FTS')
CLUSTER BY T2.FAMILY, T2.PLATFORM, T2.ROLE;
-- Final join-map table. Data columns hold the paired device ids (D2, D1) and
-- the element attributes; the table is partitioned by collection id and
-- element type.
CREATE TABLE IF NOT EXISTS NDA_CNR_DEVICE_JOIN_MAP_FH
(
    D2            BIGINT,
    D1            BIGINT,
    DV_CNT_BY_GRP BIGINT,
    FH_LVL_GRP    STRING,
    FAMILY        STRING,
    PLATFORM      STRING,
    ROLE          STRING,
    LEVEL_1       STRING,
    LEVEL_2       STRING,
    LEVEL_3       STRING,
    IS_POLICY     STRING,
    IS_GRP_POLICY STRING,
    ELEMENT       STRING
)
PARTITIONED BY (
    CIP_COLLECTION_ID BIGINT,
    ELEMENT_TYPE      STRING
);
6. Nag Arvind Gudiseva
6
-- Optimised load: join the two pre-filtered, bucketed intermediate tables on
-- (FAMILY, PLATFORM, ROLE) and write the result into the partitioned join map.
-- The last two SELECT columns feed the CIP_COLLECTION_ID / ELEMENT_TYPE
-- partitions named in the PARTITION clause.
INSERT OVERWRITE TABLE NDA_CNR_DEVICE_JOIN_MAP_FH
PARTITION (CIP_COLLECTION_ID, ELEMENT_TYPE)
SELECT
    dv.D2,
    el.D1,
    el.DV_CNT_BY_GRP,
    el.FH_LVL_GRP,
    el.FAMILY,
    el.PLATFORM,
    el.ROLE,
    el.LEVEL_1,
    el.LEVEL_2,
    el.LEVEL_3,
    el.IS_POLICY,
    el.IS_GRP_POLICY,
    el.ELEMENT,
    dv.CIP_COLLECTION_ID,
    el.ELEMENT_TYPE
FROM NDA_POLICY_DV_DETAILS_FH_TEMP dv
INNER JOIN NDA_CNR_DEVICE_ELEMENT_MAP_FH_TEMP el
    ON (dv.FAMILY = el.FAMILY
        AND dv.PLATFORM = el.PLATFORM
        AND dv.ROLE = el.ROLE);
SET COMMANDS
Use the SET commands below judiciously, where required:
-- Input split sizing (max and min set to the same byte value) and the
-- requested number of map and reduce tasks.
SET mapred.max.split.size=10857600;
SET mapred.min.split.size=10857600;
SET mapred.map.tasks=50;
SET mapred.reduce.tasks=100;
7. Nag Arvind Gudiseva
7
-- Task memory, child JVM heap, sort buffer, reducer count, speculative
-- execution and join-conversion settings.
-- Every command ends with ';' -- the last two were unterminated and would
-- otherwise have been read as a single command.
-- NOTE(review): mapred.reduce.tasks is also set to 100 earlier in this list;
-- if both are run, the later value (50) is the one left in effect.
SET mapred.job.map.memory.mb=5120;
SET mapred.job.reduce.memory.mb=5120;
SET mapred.map.child.java.opts=-Xmx4096m;
SET mapred.reduce.child.java.opts=-Xmx4096m;
SET io.sort.mb=1000;
SET mapred.reduce.tasks=50;
SET hive.mapred.reduce.tasks.speculative.execution=false;
SET hive.auto.convert.join=false;
SET hive.rpc.query.plan=true;
OTHER OPTIMIZATIONS
APACHE TEZ
Supported by the Hortonworks distribution. Other vendors, like Cloudera, MapR, etc., have placed more emphasis on
Spark. Note: Spark 2.0 is undergoing a major transformation with syntactical changes.
APACHE PIG
Pig is a data flow scripting language. Its best use case is ETL operations. Use Pig for creating data
pipelines and Hive for final long-running batch queries.