...
| Code Block |
|---|
#!/bin/bash
#
# Benchmark every locally installed ollama model and report its generation
# speed ("eval rate") in tokens per second.
# Idea taken from https://taoofmac.com/space/blog/2024/01/20/1800
#
# Usage: <this-script> [RUNS]
#   RUNS  number of benchmark runs per model (positive integer, default 3)
#
# What it does:
#   1. Remembers the current cpu governor and switches all cores to
#      "performance" (skipped when the cpufreq sysfs interface is missing).
#   2. For every model listed by `ollama ls`, sends a fixed prompt RUNS times
#      and records the "eval rate" lines in results.txt (appended).
#   3. Restores the original governor (also on error/interrupt, via an EXIT
#      trap) and restarts the ollama service.
#
# Requires: sudo, ollama, lshw, lscpu, systemctl.
#
# ANSI color reference (attribute;color):
#   Black        0;30   Dark Gray    1;30
#   Red          0;31   Light Red    1;31
#   Green        0;32   Light Green  1;32
#   Brown/Orange 0;33   Yellow       1;33
#   Blue         0;34   Light Blue   1;34
#   Purple       0;35   Light Purple 1;35
#   Cyan         0;36   Light Cyan   1;36
#   Light Gray   0;37   White        1;37
# Example:
#   RED='\033[0;31m'; NC='\033[0m'
#   echo -e "${RED}Hello Stackoverflow${NC}"

# Abort on the first unhandled failing command.
set -e

# Colors available. They hold the backslash form and are expanded by
# printf's %b (or echo -e).
borange='\e[0;33m'
yellow='\e[1;33m'
purple='\e[0;35m'
green='\e[0;32m'
red='\e[0;31m'
blue='\e[0;34m'
NC='\e[0m' # No Color

results_file=results.txt
governor_file=/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor

# Number of runs per model: optional first argument, default 3.
# Validated before anything on the system is changed.
benchmark=${1:-3}
if ! [[ "$benchmark" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: number of runs must be a positive integer, got '$benchmark'" >&2
  exit 2
fi

#######################################
# Write a governor name to every cpu core.
# Arguments: $1 - governor name (e.g. performance)
# Outputs:   tee echoes the written value to stdout
#######################################
set_governor() {
  # The glob is expanded by this shell; only tee needs root to write.
  printf '%s\n' "$1" \
    | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
}

#######################################
# Put back the governor that was active when the script started.
# Safe to call more than once: it does nothing unless the governor was
# changed and has not been restored yet.
# Globals: cpu_def (read), governor_changed (read/written)
#######################################
restore_governor() {
  if [[ "$governor_changed" == 1 ]]; then
    governor_changed=0
    echo "Setting cpu governor to"
    set_governor "$cpu_def" \
      || echo "warning: could not restore cpu governor to '$cpu_def'" >&2
  fi
}

cpu_def=""
governor_changed=0
if [[ -r "$governor_file" ]]; then
  cpu_def=$(<"$governor_file")
  # Install the trap before touching the governor so that an abort
  # (set -e, Ctrl-C) never leaves the machine stuck in "performance".
  trap restore_governor EXIT
  echo "Setting cpu governor to"
  governor_changed=1
  set_governor performance
  cpugover=$(<"$governor_file")
else
  # No cpufreq interface (VM, container, ...): benchmark anyway.
  echo "warning: $governor_file not readable, leaving cpu governor unchanged" >&2
  cpugover="unknown"
fi

# First "product:" line of lshw's display section; cut drops the
# "       product: " prefix (16 characters).
gpu_avail=$(sudo lshw -C display | grep product: | head -1 | cut -c17-)

# Only the real "Model name:" line (not "BIOS Model name:"), whitespace
# normalized by awk.
cpu_used=$(lscpu | grep -m1 -E '^[[:space:]]*Model name:' \
  | cut -f 2 -d ":" | awk '{$1=$1}1')

echo ""
echo "Simple benchmark using ollama and"
echo "whatever local Model is installed."
echo "Does not identify if $gpu_avail is benchmarking"
echo ""
echo "How many times to run the benchmark?"
printf '%s\n' "$benchmark"
echo ""

# Model names are the first column of `ollama ls`; NR > 1 skips the header
# row without filtering out models whose name happens to contain "NAME".
mapfile -t models < <(ollama ls | awk 'NR > 1 && NF {print $1}')

for model in "${models[@]}"; do
  printf 'Total runs %b%s%b\n' "$purple" "$benchmark" "$NC"
  echo ""
  echo ""
  printf '%s\n' "$model"
  ollama show "$model" --system

  {
    echo ""
    printf 'Will use model: %b%s%b\n' "$green" "$model" "$NC"
    echo ""
    printf 'Will benchmark the tokens per second for %s and or %s\n' \
      "$cpu_used" "$gpu_avail"
    echo ""
    echo ""
    printf 'Running benchmark %b%s%b times for %s and or %s\n' \
      "$purple" "$benchmark" "$NC" "$cpu_used" "$gpu_avail"
    printf 'with %b%s%b setting for cpu governor\n' \
      "$borange" "$cpugover" "$NC"
    echo ""
  } | tee -a "$results_file"

  # One entry per run that actually reported an eval rate.
  rates=()
  for ((run = 1; run <= benchmark; run++)); do
    # --verbose statistics arrive on stderr; the model's answer on stdout is
    # discarded. Both "prompt eval rate:" and "eval rate:" lines are kept in
    # the results file.
    run_stats=$(
      printf '%s\n' "Why is the blue sky blue?" \
        | ollama run "$model" --verbose 2>&1 >/dev/null \
        | grep "eval rate:" \
        | tee -a "$results_file"
    )
    if [[ -n "$run_stats" ]]; then
      printf '%s\n' "$run_stats"
    fi

    # Third field of the generation line, e.g. "eval rate: 56.78 tokens/s".
    rate=$(awk '!/prompt eval rate:/ && NF >= 3 {print $3; exit}' \
      <<<"$run_stats")
    if [[ -n "$rate" ]]; then
      rates+=("$rate")
    fi
  done

  # Average computed once, after all runs. As in the original calculation the
  # first run is left out (presumably as warm-up) whenever more than one rate
  # is available; a single rate is reported as is instead of dividing by zero.
  if ((${#rates[@]} == 0)); then
    avg="n/a"
  else
    skip=0
    if ((${#rates[@]} > 1)); then
      skip=1
    fi
    avg=$(printf '%s\n' "${rates[@]}" \
      | awk -v skip="$skip" 'NR > skip { tot += $1; n++ } END { print tot / n }')
  fi

  {
    echo ""
    printf '%b%s%b is the average %btokens per second%b using %b%s%b model\n' \
      "$red" "$avg" "$NC" "$blue" "$NC" "$green" "$model" "$NC"
    printf 'for %s and or %s\n' "$cpu_used" "$gpu_avail"
  } | tee -a "$results_file"
done

echo
printf 'using %b%s%b for cpu governor.\n' "$borange" "$cpugover" "$NC"
echo ""

# Normal path: restore now; the EXIT trap then has nothing left to do.
restore_governor

#comment this out if you are repeating the same model
#this clears model from Vram
sudo systemctl stop ollama
sudo systemctl start ollama
#EOF |
FROM REDDIT
| Code Block |
|---|
Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended |