
Installing and configuring Hadoop and Hive on Ubuntu


1. Download the JDK and configure environment variables
http://www.oracle.com/technetwork/java/javase/downloads/index.html

ls
-rwxr--r--  1 lpxuan lpxuan 85141056 2011-06-21 18:44 jdk-6u26-linux-i586.bin*
-rw-r--r--  1 lpxuan lpxuan  6599796 2011-06-21 18:42 pig-0.1.1.tar.gz
-rw-r--r--  1 lpxuan lpxuan  4031810 2011-06-21 18:42 zookeeper-3.0.1.tar.gz

-- grant execute permission
chmod u+x jdk-6u26-linux-i586.bin

-- install
./jdk-6u26-linux-i586.bin
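
The installer unpacks a jdk1.6.0_26/ directory into the current folder. The transcript below picks up with the JDK already under /usr/java, so presumably it was moved there first; a minimal sketch of that step (directory names taken from the listings that follow):

sudo mkdir -p /usr/java
sudo mv jdk1.6.0_26 /usr/java/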

sudo -i

root@hadoop1:/usr# cd java
root@hadoop1:/usr/java# ls
jdk1.6.0_26
root@hadoop1:/usr/java# cd jdk1.6.0_26/
root@hadoop1:/usr/java/jdk1.6.0_26# ls
bin        include  man               register_zh_CN.html
COPYRIGHT  jre      README.html       sample
db         lib      register.html     src.zip
demo       LICENSE  register_ja.html  THIRDPARTYLICENSEREADME.txt

-- set the JDK environment: add the JDK bin directory to PATH
/usr/java/jdk1.6.0_26/bin

root@hadoop1:/etc# cat environment
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/java/jdk1.6.0_26/bin"
export PATH
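
/etc/environment is only read at login, so the new PATH entry does not affect the current shell. A quick way to check in the current session (a sketch, not required for the install itself) is to re-export PATH and see which java binary is resolved first:

export PATH=$PATH:/usr/java/jdk1.6.0_26/bin
which java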

-- check the installation
root@hadoop1:/usr/java/jdk1.6.0_26/bin# java -version
java version "1.6.0_22"
OpenJDK Runtime Environment (IcedTea6 1.10.2) (6b22-1.10.2-0ubuntu1~11.04.1)
OpenJDK Client VM (build 20.0-b11, mixed mode, sharing)

Note that this output still reports the system OpenJDK (1.6.0_22) rather than the freshly installed JDK 1.6.0_26: the OpenJDK binary appears earlier on PATH, so it takes precedence.
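
If the Sun JDK should become the default java, one common fix on Ubuntu is to register it with update-alternatives; a hedged sketch (the priority value 300 is arbitrary):

sudo update-alternatives --install /usr/bin/java java /usr/java/jdk1.6.0_26/bin/java 300
sudo update-alternatives --install /usr/bin/javac javac /usr/java/jdk1.6.0_26/bin/javac 300
sudo update-alternatives --config java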

2. Download Hadoop and configure environment variables

http://download.huihoo.com/apache/hadoop/20081222-223.html

root@hadoop1:/home/lpxuan/下载# ll
total 143352
drwxr-xr-x  3 lpxuan lpxuan     4096 2011-06-22 18:10 ./
drwxr-xr-x 24 lpxuan lpxuan     4096 2011-06-21 18:31 ../
-rw-r--r--  1 lpxuan lpxuan 42266180 2011-06-21 18:44 hadoop-0.19.0.tar.gz
-rw-r--r--  1 lpxuan lpxuan  8716797 2011-06-21 18:42 hbase-0.2.1.tar.gz
drwxr-xr-x 10 lpxuan lpxuan     4096 2011-06-22 18:10 jdk1.6.0_26/
-rwxr--r--  1 lpxuan lpxuan 85141056 2011-06-21 18:44 jdk-6u26-linux-i586.bin*
-rw-r--r--  1 lpxuan lpxuan  6599796 2011-06-21 18:42 pig-0.1.1.tar.gz
-rw-r--r--  1 lpxuan lpxuan  4031810 2011-06-21 18:42 zookeeper-3.0.1.tar.gz

root@hadoop1:/home/lpxuan/下载# tar zxvf hadoop-0.19.0.tar.gz

root@hadoop1:/home/lpxuan/下载# mv hadoop-0.19.0 /opt/

root@hadoop1:/opt# ll
total 12
drwxr-xr-x  3 root root 4096 2011-06-22 18:41 ./
drwxr-xr-x 22 root root 4096 2011-06-22 02:02 ../
drwxr-xr-x 12 root root 4096 2008-11-14 11:14 hadoop-0.19.0/

root@hadoop1:/opt/hadoop-0.19.0# ll
total 4148
drwxr-xr-x 12 root root    4096 2008-11-14 11:14 ./
drwxr-xr-x  3 root root    4096 2011-06-22 18:41 ../
drwxr-xr-x  2 root root    4096 2011-06-22 18:39 bin/
-rw-rw-r--  1 root root   57430 2008-11-14 11:09 build.xml
drwxr-xr-x  4 root root    4096 2008-11-14 11:14 c++/
-rw-rw-r--  1 root root  287046 2008-11-14 11:09 CHANGES.txt
drwxr-xr-x  2 root root    4096 2011-06-22 18:39 conf/
drwxr-xr-x 12 root root    4096 2008-11-14 11:09 contrib/
drwxr-xr-x  6 root root    4096 2011-06-22 18:39 docs/
-rw-rw-r--  1 root root    6839 2008-11-14 11:09 hadoop-0.19.0-ant.jar
-rw-rw-r--  1 root root 2370110 2008-11-14 11:14 hadoop-0.19.0-core.jar
-rw-rw-r--  1 root root  134119 2008-11-14 11:09 hadoop-0.19.0-examples.jar
-rw-rw-r--  1 root root 1256675 2008-11-14 11:14 hadoop-0.19.0-test.jar
-rw-rw-r--  1 root root   52295 2008-11-14 11:09 hadoop-0.19.0-tools.jar
drwxr-xr-x  4 root root    4096 2011-06-22 18:39 lib/
drwxr-xr-x  3 root root    4096 2011-06-22 18:39 libhdfs/
drwxr-xr-x  2 root root    4096 2011-06-22 18:39 librecordio/
-rw-rw-r--  1 root root   11358 2008-11-14 11:09 LICENSE.txt
-rw-rw-r--  1 root root     101 2008-11-14 11:09 NOTICE.txt
-rw-rw-r--  1 root root    1366 2008-11-14 11:09 README.txt
drwxr-xr-x 15 root root    4096 2011-06-22 18:39 src/
drwxr-xr-x  8 root root    4096 2008-11-14 11:09 webapps/

root@hadoop1:/opt/hadoop-0.19.0# ln -sf /opt/hadoop-0.19.0/ /opt/hadoop

root@hadoop1:/opt/hadoop/conf# vi hadoop-env.sh
# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME.  All others are
# optional.  When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.  Required.
# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
export JAVA_HOME=/usr/java/jdk1.6.0_26

# Extra Java CLASSPATH elements.  Optional.
# export HADOOP_CLASSPATH=
export HADOOP_HOME=/opt/hadoop
export PATH=$PATH:/opt/hadoop/bin

# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000

# Extra Java runtime options.  Empty by default.
# export HADOOP_OPTS=-server

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS"
export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS"
export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS"
# export HADOOP_TASKTRACKER_OPTS=
# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
# export HADOOP_CLIENT_OPTS

# Extra ssh options.  Empty by default.
# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR"

# Where log files are stored.  $HADOOP_HOME/logs by default.
# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves

# host:path where hadoop code should be rsync'd from.  Unset by default.
# export HADOOP_MASTER=master:/home/$USER/src/hadoop

# Seconds to sleep between slave commands.  Unset by default.  This
# can be useful in large clusters, where, e.g., slave rsyncs can
# otherwise arrive faster than the master can service them.
# export HADOOP_SLAVE_SLEEP=0.1

# The directory where pid files are stored. /tmp by default.
# export HADOOP_PID_DIR=/var/hadoop/pids

# A string representing this instance of hadoop. $USER by default.
# export HADOOP_IDENT_STRING=$USER

# The scheduling priority for daemon processes.  See 'man nice'.
# export HADOOP_NICENESS=10

-- check that the installation works
root@hadoop1:/opt/hadoop/bin# ./hadoop
Usage: hadoop [–config confdir] COMMAND
where COMMAND is one of:
namenode -format     format the DFS filesystem
secondarynamenode    run the DFS secondary namenode
namenode             run the DFS namenode
datanode             run a DFS datanode
dfsadmin             run a DFS admin client
fsck                 run a DFS filesystem checking utility
fs                   run a generic filesystem user client
balancer             run a cluster balancing utility
jobtracker           run the MapReduce job Tracker node
pipes                run a Pipes job
tasktracker          run a MapReduce task Tracker node
job                  manipulate MapReduce jobs
queue                get information regarding JobQueues
version              print the version
jar <jar>            run a jar file
distcp <srcurl> <desturl> copy file or directories recursively
archive -archiveName NAME <src>* <dest> create a hadoop archive
daemonlog            get/set the log level for each daemon
or
CLASSNAME            run the class named CLASSNAME
Most commands print help when invoked w/o parameters.

-- configure hadoop-site.xml

root@hadoop1:/etc# cat hosts
127.0.0.1    localhost
127.0.1.1    hadoop1

# The following lines are desirable for IPv6 capable hosts
::1     ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters

The hostname is hadoop1; it is referenced as the host in hadoop-site.xml below.
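
The hostname can be double-checked against the hosts file. Ubuntu maps the hostname to 127.0.1.1 by default (which is why the NameNode log later reports hadoop1/127.0.1.1); that is fine for a single-node setup, but on a real cluster the hadoop1 entry would normally point to the machine's LAN address instead.

hostname
grep hadoop1 /etc/hosts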

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hadoop1:9000</value>
<description>The name of the default file system. Either the literal string "local" or a host:port for DFS.</description>
</property>
<property>
<name>mapred.job.tracker</name>
<value>hadoop1:9001</value>
<description>The host and port that the MapReduce job tracker runs at. If "local", then jobs are run in-process as a single map and reduce task.</description>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/home/hadoop/hadoop-${user.name}</value>
<description>A base for other temporary directories.</description>
</property>
<property>
<name>dfs.name.dir</name>
<value>/home/hadoop/hadoop/filesystem/name</value>
<description>Determines where on the local filesystem the DFS name node should store the name table. If this is a comma-delimited list of directories then the name table is replicated in all of the directories, for redundancy. </description>
</property>
<property>
<name>dfs.data.dir</name>
<value>/home/hadoop/hadoop/filesystem/data</value>
<description>Determines where on the local filesystem an DFS data node should store its blocks. If this is a comma-delimited list of directories, then data will be stored in all named directories, typically on different devices. Directories that do not exist are ignored.</description>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
<description>Default block replication. The actual number of replications can be specified when the file is created. The default is used if replication is not specified in create time.</description>
</property>
</configuration>
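
The name, data, and tmp directories referenced above are created automatically when the NameNode is formatted and the daemons start (as the transcript below confirms), but they can also be created up front. A small sketch using the paths from the configuration above:

mkdir -p /home/hadoop/hadoop/filesystem/name
mkdir -p /home/hadoop/hadoop/filesystem/data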

-- set up ssh
root@hadoop1:/opt/hadoop/conf# ps -e |grep ssh
1297 ?        00:00:00 ssh-agent
root@hadoop1:/opt/hadoop/conf# ssh localhost
ssh: connect to host localhost port 22: Connection refused

root@hadoop1:/opt/hadoop/conf# sudo apt-get install openssh-server
Reading package lists... Done
Building dependency tree
Reading state information... Done
The following extra packages will be installed:
ssh-import-id
Suggested packages:
rssh molly-guard openssh-blacklist openssh-blacklist-extra
The following NEW packages will be installed:
openssh-server ssh-import-id
0 upgraded, 2 newly installed, 0 to remove and 171 not upgraded.
Need to get 317 kB of archives.
After this operation, 913 kB of additional disk space will be used.

root@hadoop1:/opt/hadoop/conf# sudo /etc/init.d/ssh start
Rather than invoking init scripts through /etc/init.d, use the service(8)
utility, e.g. service ssh start

Since the script you are attempting to invoke has been converted to an
Upstart job, you may also use the start(8) utility, e.g. start ssh
root@hadoop1:/opt/hadoop/conf# ps -e |grep ssh
1297 ?        00:00:00 ssh-agent
2548 ?        00:00:00 sshd

-- set up passwordless ssh login
root@hadoop1:/opt/hadoop/conf# ssh-keygen -t rsa
Generating public/private rsa key pair.
Enter file in which to save the key (/root/.ssh/id_rsa):
Created directory '/root/.ssh'.
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /root/.ssh/id_rsa.
Your public key has been saved in /root/.ssh/id_rsa.pub.
The key fingerprint is:
c0:71:4e:38:4e:42:cb:cb:f2:32:52:16:6a:08:a7:66 root@hadoop1
The key's randomart image is:
+--[ RSA 2048]----+
|   .. ..o        |
|   ..o+=         |
|. o o+o..        |
|o+ o ...         |
|+E+ o   S        |
|+o o             |
|. o .            |
| . o             |
|                 |
+-----------------+

root@hadoop1:~/.ssh# cat id_rsa
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEArBID1Vnx4VsMZcnBzk/IXD33TInM1S38s8RhBVf6CeJLCjM8
34Uz/VM45BOHFbt3QNByHI4w7QaMzg0reOCiH3ndY9VfvnYtPjRcI2a2v0lRhW8n
wmy8vA1Mvugi5viIWiZMSwzPWxcj0faLyaptqIsSv3UFJdgw2dv78r9rKmT36kp8
+YKBY7us619vhC0rDa/cIQpTEJduCA/+fEUFdYzfVWCYAYnSij+PwFdVxUmaseL1
yXdPjvPB7mY0TsiARYBINgy198HNmtl6DPQU5JMPrkBBHCjUEmxaVlprdvJGTxAa
jDmT6ymGviDQz64CG1RsPGZZ4coT5vEzDcBOrQIDAQABAoIBAGWXxVekYWFyxmEa
vCi5hkyf8XDpM1zyZ+8jlTz4cpJ/X1rIYbp/BPEev8o5lVpJcuF7sMQXV9+6LExE
DlSyHEaFRyd4ha0ITTIa7AOi02chPNaRiGIMnWpSV+unV2QTfT3susLc71iS0v1W
mbYZv+DxK8KKrt1nRYB7l8f9KKLAPqEYIFzv07oFV/tXEgNZy7JKKTGfH1mzDn30
ucdWsau6Lxv2OAW7Ev01GSdGyP9PwL/seAFkNBRSFFZ32w6u1oqbM+1onxTzheoq
QsNGVhKMPAK/ILcbxgbAE1mVbqBMxlzX1i4Z0pEwJ5PaaZz1HgLSyMzrQ6oy1f2P
cOnzF8ECgYEA0obdPMbmLWHHZoDEKeUafhU/g9FpaMA3H9w08bGwK7cv5hRAZBIs
yxy48oygcV0vnfC9IHU9Qkc+92AkbTZ2sFLgahRM4cDWChd1g4YAgm5R7EjzMnsq
XJQ6cm6JGUQKwVoYfw06gaJeFBle7XDonogR3YOdEwKHL2T3XBe5XyUCgYEA0Tyx
DiWrIWIGXPxZr2HWHnYC/7ldFNzboGbSn3fJGN7tm33Fyd4P14NHWXA2LreRbrKV
oZRZCBOSLAGFFDUqhLAK6UhKZDi0CbYRZvnZ4T1UB6em7sBZxgIhe/lidjfba8Ce
uJthCXOwK+sZUNEC84AYQdiAwvBrgB0cqcp6/ukCgYB+m5avd1p0DmqxrVzLaTmu
e67P4n+G/JnqMi8w71BoaemHb8RtqjSADgz36TSQGJ+LV30V6QvnMRuf/5TIjUmB
rsXBZeX6mlLejM8iQtNJnXjtJc4EHOgT9xzTNsCCjlX9g+ZZeiYmUfRMGBnrp8xt
kp1T31P2W73a9diA7M+RdQKBgGbMEJjIvOjrxgCNImreAFp61EJbCXYkg+qsWbJr
IWuMquQHyNLCvLm/D6DLVnNhUJw4NPdrcMpdAyWHoaAp9re13cZ688GFcj6LKsWS
3w6gGNah8Yu/CNwVU+oavdsi2jR4MAK2o9gG9Hi/SnLAHVkQh9phyfD8OXR52Qk6
J29xAoGBALKMgnHdvyRLR8V2w0R9wwxyJtw51jAsS2TUMq0kvweNjE3lGO8zL3tu
xspIIC1VIxBt6BBZ/Hyc+jdvX+ExxZnd3z1piVBdwpN6iNxp6TDklAwL2LCoJBhU
KyxKkmeKAZ8Tr+M7jnmQDfkFYxU6vzPutmFI7Zi3qVP1bxFEZKXT
-----END RSA PRIVATE KEY-----

root@hadoop1:~/.ssh# cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
root@hadoop1:~/.ssh# cat authorized_keys
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCsEgPVWfHhWwxlycHOT8hcPfdMiczVLfyzxGEFV/oJ4ksKMzzfhTP9UzjkE4cVu3dA0HIcjjDtBozODSt44KIfed1j1V++di0+NFwjZra/SVGFbyfCbLy8DUy+6CLm+IhaJkxLDM9bFyPR9ovJqm2oixK/dQUl2DDZ2/vyv2sqZPfqSnz5goFju6zrX2+ELSsNr9whClMQl24ID/58RQV1jN9VYJgBidKKP4/AV1XFSZqx4vXJd0+O88HuZjROyIBFgEg2DLX3wc2a2XoM9BTkkw+uQEEcKNQSbFpWWmt28kZPEBqMOZPrKYa+INDPrgIbVGw8ZlnhyhPm8TMNwE6t root@hadoop1
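
sshd is strict about key file permissions; if the login test below still prompts for a password, tightening them usually helps (standard OpenSSH requirements, shown here as a precaution):

chmod 700 ~/.ssh
chmod 600 ~/.ssh/authorized_keys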

-- test
root@hadoop1:~/.ssh# ssh localhost
The authenticity of host 'localhost (127.0.0.1)' can't be established.
ECDSA key fingerprint is e8:0c:77:fc:b5:1a:76:46:e1:97:ec:e0:d2:ae:b1:cd.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added 'localhost' (ECDSA) to the list of known hosts.
Welcome to Ubuntu 11.04 (GNU/Linux 2.6.38-8-generic i686)

* Documentation:  https://help.ubuntu.com/

The programs included with the Ubuntu system are free software;
the exact distribution terms for each program are described in the
individual files in /usr/share/doc/*/copyright.

Ubuntu comes with ABSOLUTELY NO WARRANTY, to the extent permitted by
applicable law.

-- start hadoop
root@hadoop1:/opt/hadoop/conf# source hadoop-env.sh
root@hadoop1:/opt/hadoop/conf# hadoop namenode -format
11/06/23 17:23:59 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = hadoop1/127.0.1.1
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 0.19.0
STARTUP_MSG:   build = https://svn.apache.org/repos/asf/hadoop/core/branches/branch-0.19 -r 713890; compiled by 'ndaley' on Fri Nov 14 03:12:29 UTC 2008
************************************************************/
11/06/23 17:24:00 INFO namenode.FSNamesystem: fsOwner=root,root
11/06/23 17:24:00 INFO namenode.FSNamesystem: supergroup=supergroup
11/06/23 17:24:00 INFO namenode.FSNamesystem: isPermissionEnabled=true
11/06/23 17:24:00 INFO common.Storage: Image file of size 94 saved in 0 seconds.
11/06/23 17:24:00 INFO common.Storage: Storage directory /home/hadoop/hadoop/filesystem/name has been successfully formatted.
11/06/23 17:24:00 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at hadoop1/127.0.1.1
************************************************************/

-- note that a hadoop directory has now appeared under /home
root@hadoop1:/home# ls
hadoop  lpxuan
root@hadoop1:/home# cd hadoop/hadoop/filesystem/
root@hadoop1:/home/hadoop/hadoop/filesystem# ls
name

root@hadoop1:/opt/hadoop/bin# start-all.sh
starting namenode, logging to /opt/hadoop/logs/hadoop-root-namenode-hadoop1.out
localhost: starting datanode, logging to /opt/hadoop/logs/hadoop-root-datanode-hadoop1.out
localhost: starting secondarynamenode, logging to /opt/hadoop/logs/hadoop-root-secondarynamenode-hadoop1.out
starting jobtracker, logging to /opt/hadoop/logs/hadoop-root-jobtracker-hadoop1.out
localhost: starting tasktracker, logging to /opt/hadoop/logs/hadoop-root-tasktracker-hadoop1.out

root@hadoop1:/opt/hadoop/bin# jps
3216 DataNode
3556 TaskTracker
3065 NameNode
3358 SecondaryNameNode
3416 JobTracker
8303 Jps
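
With all five daemons listed by jps, the web interfaces can be checked as well; on Hadoop 0.19 the NameNode UI listens on port 50070 and the JobTracker UI on port 50030 by default (ports assumed from the stock configuration; wget is used here, a browser pointed at the same URLs works too):

wget -qO- http://hadoop1:50070/ | head
wget -qO- http://hadoop1:50030/ | head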

-- Hive (shipped under contrib/hive in this Hadoop release)
root@hadoop1:/opt/hadoop/bin# export HADOOP=/opt/hadoop
root@hadoop1:/opt/hadoop/bin# export HIVE_HOME=/opt/hadoop/contrib/hive
root@hadoop1:/opt/hadoop/bin# cd $HIVE_HOME
root@hadoop1:/opt/hadoop/contrib/hive# bin/hive
hive> create table pokes(foo INT, bar STRING);
OK
Time taken: 5.887 seconds
hive> create table invites(foo INT, bar STRING) PARTITIONED BY (ds STRING);
OK
Time taken: 0.096 seconds
hive> SHOW TABLES;
OK
invites    pokes
Time taken: 0.413 seconds
hive> DESCRIBE invites;
OK
foo    int
bar    string
ds    string
Time taken: 0.192 seconds
hive> ALTER TABLE pokes ADD COLUMNS(new_col INT);
OK
Time taken: 0.145 seconds
hive> ALTER TABLE invites ADD COLUMNS(newe_col2 INT COMMENT 'a comment');
OK
Time taken: 0.127 seconds
hive> DESCRIBE pokes
> ;
OK
foo    int
bar    string
new_col    int
Time taken: 0.094 seconds
hive> DESCRIBE invites;
OK
foo    int
bar    string
newe_col2    int    'a comment'
ds    string
Time taken: 0.064 seconds
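
As a next step, data could be loaded into pokes and queried. A hedged sketch following the standard Hive getting-started example; the kv1.txt sample path assumes the examples directory shipped with this Hive build:

hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
hive> SELECT count(1) FROM pokes;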
