10、网站用户行为分析
2.1 数据准备
数据集下载与查看
cd /usr/local
ls
sudo mkdir bigdatacase
sudo chown -R hadoop:hadoop ./bigdatacase
cd bigdatacase
mkdir dataset
cd ~/下载
ls
unzip user.zip -d /usr/local/bigdatacase/dataset
cd /usr/local/bigdatacase/dataset
ls
head -5 raw_user.csv
数据集预处理
sed -i ‘1d’ raw_user
head -5 raw_user.csv
vim pre_deal.sh
bash ./pre_deal.sh raw_user.csv raw_user.txt
head -10 raw_user.txt
把数据集导入HDFS中
hdfs dfs -mkdir -p /bigdatacase/dataset
hdfs dfs -put /usr/local/bigdatacase/dataset/raw_user.txt /bigdatacase/dataset
hdfs dfs -ls /bigdatacase/dataset
hdfs dfs -cat /bigdatacase/dataset/raw_user.txt | head -10
在Hive上创建数据库
hive
create database dblab;
use dblab;
CREATE EXTERNAL TABLE dblab.rawdata_user(id INT,uid STRING,item_id STRING,behavior_type INT,item_category STRING,visit_date DATE,province STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY \’\t\’
STORED AS TEXTFILE
LOCATION \’/bigdatacase/dataset\’;
select * from rawdata_user limit 10;
2.2 Hive数据分析
1)用户行为分析需求:2014-12-11~12号有多少条购买商品的记录
分析步骤
- 语句:select count(*) from bigdata_user where visit_date >\’2014-12-10\’ and visit_date <\’2014-12-13\’ and behavior_type=\’4\’ limit 10;
- 结果截图:运行或存为表格后的查询显示
-
(2)用户行为分析需求:分析每月1-31号购买情况
- 语句:select count(distinct uid), day(visit_date) from bigdata_user where behavior_type=\’4\’ group by day(visit_date);
- 结果截图:运行或存为表格后的查询显示
- (3)按某一特殊日期(如双12)进行用户行为分析
- 各省份购买商品数量
-
Select count(province),province from rawdata_user where behavior_type=‘4’ group by province;
-
- 商品购买与浏览比例
-
Create table user_12 as select * from rawdata_user where visit_data=‘2014-12-12’; Select count(*) from user_12 where behavior_type=‘4’;
-
select count(*) from user_12;
-
- 用户活跃度分析
- select count(*) from user_12 where uid=10001082;
-
- 购买5件以上商品的用户
- select uid,count(*) from user_12 where behavior_type=\’4\’ group by uid having count(*)>5 limit 10;
-
(3)自定义需求:用户购买次数大于5的用户id和地区信息以及购买次数
- 语句:select uid,count(*),province from rawdata_user where behavior_type=\’4\’ group by uid,province having count(behavior_type=\’4\’)>5 limit 10;
- 结果截图:运行或存为表格后的查询显示
-