HDFS disk consumption – Find what is taking up HDFS space
Source: https://community.hortonworks.com/articles/16846/how-to-identify-what-is-consuming-space-in-hdfs.html
Script
#!/usr/bin/env bash
#
# Report HDFS disk consumption as an indented directory tree.
#
# Usage: script [max_depth]
#   max_depth  Depth limit for listed sub-directories (default: 5). A
#              directory is listed when its path contains fewer than
#              max_depth '/' characters, so the default shows up to four
#              path components (e.g. /a/b/c/d).
#
# Output: a "bytes directory" header, then one line per directory. Top-level
# directories are printed largest first; each is followed by its
# sub-directories, whose parent path is drawn as a "|----" rule.
#
# Requires the `hdfs` command on PATH.
#
# `set -e` is deliberately not used: a single `hdfs` call may fail (for
# example on a directory that cannot be read) and the report should carry on
# with the remaining directories. Failures are handled explicitly instead.
set -u

# Print a diagnostic message to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Build the tree label for a directory path: every character of the parent
# path becomes '-', the first one becomes '|' (when the parent is longer than
# one character), and the last path component is appended after a '/'.
# A path whose parent is empty (e.g. "/hdp") is printed unchanged.
# Arguments: $1 - directory path
# Outputs:   the label on stdout
#######################################
tree_label() {
  local path=$1
  local parent=${path%/*}
  local child=${path##*/}

  if [[ -z "$parent" ]]; then
    printf '%s\n' "$path"
    return 0
  fi

  local rule=${parent//?/-}
  if (( ${#rule} > 1 )); then
    rule="|${rule:1}"
  fi
  printf '%s/%s\n' "$rule" "$child"
}

#######################################
# Print the size in bytes of one directory: the first column of
# `hdfs dfs -du -s`.
# Arguments: $1 - directory path
# Outputs:   the size on stdout
# Returns:   non-zero if hdfs fails or the first column is not a number
#######################################
dir_size() {
  local out size
  out=$(hdfs dfs -du -s "$1") || return 1
  read -r size _ <<<"$out"
  [[ "$size" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$size"
}

#######################################
# List the directories below a root, one path per line, restricted to paths
# containing fewer than max_depth '/' characters.
# Arguments: $1 - root directory, $2 - max_depth
# Outputs:   directory paths on stdout
#######################################
list_dirs() {
  local root=$1
  local max_depth=$2
  # One `hdfs dfs -ls -R` entry whose mode string starts with "dr": the mode
  # plus six more whitespace-separated columns, then the path. Capturing the
  # rest of the line keeps paths that contain spaces intact.
  local entry_re='^dr[^[:space:]]*([[:space:]]+[^[:space:]]+){6}[[:space:]]+(.+)$'
  local line path slashes

  while IFS= read -r line; do
    [[ "$line" =~ $entry_re ]] || continue
    path=${BASH_REMATCH[2]}
    slashes=${path//[^\/]/}   # keep only the '/' characters to count them
    if (( ${#slashes} < max_depth )); then
      printf '%s\n' "$path"
    fi
  done < <(hdfs dfs -ls -R "$root")
  return 0
}

#######################################
# Entry point: print the disk-usage report.
# Arguments: $1 - optional max_depth (default 5)
# Outputs:   report on stdout, diagnostics on stderr
# Returns:   0 on success, 2 on a bad argument, 127 if hdfs is missing,
#            1 if no top-level directories could be read
#######################################
main() {
  local max_depth=${1:-5}
  if ! [[ "$max_depth" =~ ^[0-9]+$ ]]; then
    warn "max_depth must be a non-negative integer, got: $max_depth"
    return 2
  fi
  max_depth=$((10#$max_depth))   # force base 10 so "08" is not read as octal

  if ! command -v hdfs >/dev/null 2>&1; then
    warn "hdfs command not found"
    return 127
  fi

  # A `du -s` line is "size path" or "size disk_space_consumed path"
  # depending on the Hadoop release; the path is whatever follows the leading
  # numeric column(s).
  local du_re='^([0-9]+)[[:space:]]+([0-9]+[[:space:]]+)?(.+)$'

  # The glob is quoted so that hdfs, not the local shell, expands it.
  local roots
  roots=$(hdfs dfs -du -s '/*' | LC_ALL=C sort -nr)
  if [[ -z "$roots" ]]; then
    warn "hdfs dfs -du returned no top-level directories"
    return 1
  fi

  printf '%15s %s\n' "bytes" "directory"

  local line root root_size dir size
  # The loops read from fds 3 and 4 so the hdfs commands run inside them
  # keep the script's own stdin.
  while IFS= read -r line <&3; do
    [[ "$line" =~ $du_re ]] || continue
    root_size=${BASH_REMATCH[1]}
    root=${BASH_REMATCH[3]}
    # The root's size is already known from the listing above.
    printf '%15d %s\n' "$root_size" "$root"

    while IFS= read -r dir <&4; do
      if size=$(dir_size "$dir"); then
        printf '%15d %s\n' "$size" "$(tree_label "$dir")"
      else
        warn "skipping $dir: could not determine its size"
      fi
    done 4< <(list_dirs "$root" "$max_depth")
  done 3<<<"$roots"
  return 0
}

main "$@"
Output
          bytes directory
      480376973 /hdp
      480376973 |---/apps
      480376973 |--------/2.3.4.0-3485
       98340772 |---------------------/hive
      210320342 |---------------------/mapreduce
       97380893 |---------------------/pig
       15830286 |---------------------/sqoop
       58504680 |---------------------/tez
       24453973 /user
              0 |----/admin
        3629715 |----/ambari-qa
        3440200 |--------------/.staging
         653010 |-----------------------/job_1454293069490_0001
Other options
https://github.com/mr-jstraub/HDFSQuota/blob/master/HDFSQuota.ipynb
https://github.com/twitter/hdfs-du