📚 데이터베이스/빅데이터

[Hadoop/Hive] installation/configuration 하둡 하이브 설치 방법

써니(>_<) 2022. 8. 2. 05:05

1. Download Hadoop files

2. Update necessary config files

3. Download Hive files

4. Update Hive config file

5. Install Hive metastore 

 

/* Update the system and install Java */
# Refresh the package index before installing anything.
sudo apt update

# Hadoop 3.3.x runs on Java 8; install the OpenJDK 8 JDK non-interactively.
sudo apt install openjdk-8-jdk -y

# Confirm both the runtime and the compiler are available.
java -version; javac -version


/* Install open SSH  */
# Hadoop's start scripts use ssh to launch daemons, even on a single node.
sudo apt install openssh-server openssh-client -y

# Generate a passphrase-less RSA key pair for password-less ssh to localhost.
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa

# Authorize the new public key for this user.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

# authorized_keys must not be group/world writable or sshd will ignore it.
chmod 0600 ~/.ssh/authorized_keys

# Verify password-less login works (accept the host key when prompted).
ssh localhost

/* Install Hadoop */

# Download the Hadoop 3.3.1 binary tarball into /disk/hadoop.
wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz -P /disk/hadoop

cd /disk/hadoop

# Extract; creates the directory noted on the next line.
tar xzf hadoop-3.3.1.tar.gz

--/disk/hadoop/hadoop-3.3.1

cd ~

# Append the Hadoop environment variables (next section) to ~/.bashrc.
sudo nano .bashrc

# Hadoop-related options — append at the end of the .bashrc file.
export HADOOP_HOME=/disk/hadoop/hadoop-3.3.1
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
# Fixed: the path previously read "lib/nativ" (typo); the native libraries
# live in $HADOOP_HOME/lib/native, matching HADOOP_COMMON_LIB_NATIVE_DIR above.
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"


# Reload the shell profile so the new Hadoop variables take effect.
source ~/.bashrc


# Locate javac, then resolve the symlink to find the real JDK home.
which javac
--/usr/bin/javac

readlink -f /usr/bin/javac
--/usr/lib/jvm/java-8-openjdk-amd64/bin/javac

# Set JAVA_HOME in hadoop-env.sh (the resolved path above minus /bin/javac).
sudo nano $HADOOP_HOME/etc/hadoop/hadoop-env.sh
-- Update 
	export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64

sudo nano $HADOOP_HOME/etc/hadoop/core-site.xml

<configuration>
<!-- Local scratch directory Hadoop uses for temporary files. -->
<property>
  <name>hadoop.tmp.dir</name>
  <value>/disk/ubuntuhive/tmpdata</value>
</property>
<!-- Default filesystem URI. fs.defaultFS replaces fs.default.name,
     which has been deprecated since Hadoop 2.x. -->
<property>
  <name>fs.defaultFS</name>
  <value>hdfs://127.0.0.1:9000</value>
</property>
</configuration>

mkdir -p /disk/ubuntuhive/tmpdata
mkdir -p /disk/ubuntuhive/dfsdata/namenode
mkdir -p /disk/ubuntuhive/dfsdata/datanode


sudo nano $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<configuration>
<!-- Where the NameNode stores the filesystem image and edit logs.
     Fixed: this property was mis-named dfs.data.dir; with two entries
     sharing one key, the namenode directory was silently ignored. -->
<property>
  <name>dfs.namenode.name.dir</name>
  <value>/disk/ubuntuhive/dfsdata/namenode</value>
</property>
<!-- Where the DataNode stores HDFS block data (dfs.datanode.data.dir is
     the current name; dfs.data.dir is the deprecated Hadoop 1.x alias). -->
<property>
  <name>dfs.datanode.data.dir</name>
  <value>/disk/ubuntuhive/dfsdata/datanode</value>
</property>
<!-- Single-node cluster: keep exactly one replica of each block. -->
<property>
  <name>dfs.replication</name>
  <value>1</value>
</property>
</configuration>


sudo nano $HADOOP_HOME/etc/hadoop/mapred-site.xml
<configuration> 
<!-- Run MapReduce jobs on YARN instead of the default local runner. -->
<property> 
  <name>mapreduce.framework.name</name> 
  <value>yarn</value> 
</property> 
</configuration>


sudo nano $HADOOP_HOME/etc/hadoop/yarn-site.xml

<configuration>
<!-- Auxiliary shuffle service required by MapReduce jobs running on YARN. -->
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
<property>
  <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
  <value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<!-- Single-node setup: the ResourceManager runs on localhost. -->
<property>
  <name>yarn.resourcemanager.hostname</name>
  <value>127.0.0.1</value>
</property>
<!-- Disable YARN ACL checks on this single-user cluster. -->
<property>
  <name>yarn.acl.enable</name>
  <value>0</value>
</property>
<!-- Environment variables containers may inherit from the NodeManager.
     Fixed: CLASSPATH_PERPEND_DISTCACHE was a typo for
     CLASSPATH_PREPEND_DISTCACHE (the value documented in yarn-default.xml). -->
<property>
  <name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>

/* The below command needs to be executed only once, while installing Hadoop */
hdfs namenode -format

/* The below commands need to be executed whenever you don't see all the expected processes in the jps output — usually after restarting the VM */
cd /disk/hadoop/hadoop-3.3.1/sbin/
 ./start-dfs.sh
 ./start-yarn.sh

jps
/* Install Hive */
cd ~
# Download the Hive 3.1.2 binary tarball into /disk/hive.
wget https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz -P /disk/hive

cd /disk/hive

tar xzf apache-hive-3.1.2-bin.tar.gz

cd ~

# Append the two Hive environment variables below to ~/.bashrc.
sudo nano .bashrc

export HIVE_HOME="/disk/hive/apache-hive-3.1.2-bin"
export PATH=$PATH:$HIVE_HOME/bin

cd ~
# Reload so HIVE_HOME and the updated PATH take effect.
source ~/.bashrc



# Point Hive at the Hadoop installation: add the export line to hive-config.sh.
sudo nano $HIVE_HOME/bin/hive-config.sh
export HADOOP_HOME=/disk/hadoop/hadoop-3.3.1


# Create the HDFS scratch directory Hive uses and make it group-writable.
hdfs dfs -mkdir /tmp
hdfs dfs -chmod g+w /tmp
hdfs dfs -ls /

# Create the default Hive warehouse directory in HDFS.
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -chmod g+w /user/hive/warehouse
hdfs dfs -ls /user/hive

# Start hive-site.xml from the bundled template.
cd $HIVE_HOME/conf
cp hive-default.xml.template hive-site.xml


# Add the two properties below to hive-site.xml so the ${system:...}
# placeholders used elsewhere in the file resolve.
cd /disk/hive/apache-hive-3.1.2-bin/conf
sudo nano hive-site.xml
<property><name>system:java.io.tmpdir</name><value>/tmp/hive/java</value></property>
<property><name>system:user.name</name><value>${user.name}</value></property>

While editing hive-site.xml, search for the description text below. The Hive 3.1.2 template contains an invalid control character (&#8;) between "for" and "transactional" that breaks XML parsing — delete that character so the sentence reads:
"Ensures commands with OVERWRITE (such as INSERT OVERWRITE) acquire Exclusive locks for transactional tables. This ensures that inserts (w/o overwrite) running concurrently
  are not hidden by the INSERT OVERWRITE."

Also verify that the following property is set to false (otherwise schematool / metastore startup can fail on a schema version-check mismatch):
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
  <description>
    Enforce metastore schema version consistency.
  True: Verify that version information stored in is compatible with one from Hive jars.  Also disable automatic
 schema migration attempt. Users are required to manually migrate schema after Hive upgrade which ensures
 proper metastore schema migration. (Default)
    False: Warn if the version information stored in metastore doesn't match with one from in Hive jars.
  </description>
</property>





cd $HIVE_HOME
# One-time initialization of the metastore schema in embedded Derby.
$HIVE_HOME/bin/schematool -dbType derby -initSchema
hive


# Launch the Hive CLI from the bin directory.
cd $HIVE_HOME/bin;hive

# Confirm the Hadoop daemons are still running.
jps



/* Setup for beeline */
# Allow the 'ubuntuhive' OS user to impersonate other users through
# HiveServer2 (required for beeline logins): add these to core-site.xml.
sudo nano $HADOOP_HOME/etc/hadoop/core-site.xml
<property>
     <name>hadoop.proxyuser.ubuntuhive.hosts</name>
     <value>*</value>
</property>
<property>
     <name>hadoop.proxyuser.ubuntuhive.groups</name>
     <value>*</value>
</property>

# Restart HDFS and YARN so the proxy-user change is picked up.
cd /disk/hadoop/hadoop-3.3.1/sbin/
./stop-dfs.sh
./stop-yarn.sh

./start-dfs.sh
./start-yarn.sh

# Start HiveServer2 in the foreground; leave this terminal running.
cd $HIVE_HOME/bin;
hiveserver2

# Connect with beeline over JDBC (port 10000) as user 'ubuntuhive'.
beeline -u jdbc:hive2://localhost:10000 -n ubuntuhive