<|im_end|>\n"
]
}
],
"source": [
"print(dataset['text'][0])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0f1cff57-b914-4f6f-ab1c-aa2f564486ca",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Map (num_proc=2): 100%|██████████| 2572/2572 [00:03<00:00, 663.48 examples/s]\n",
"Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
]
}
],
"source": [
"from trl import SFTTrainer\n",
"from transformers import TrainingArguments\n",
"from unsloth import is_bfloat16_supported\n",
"\n",
"# Hyper-parameters for the supervised fine-tuning run, grouped by concern.\n",
"training_args = TrainingArguments(\n",
"    # Run length: one full pass over the training set.\n",
"    # For a quick smoke test, swap num_train_epochs for max_steps = 60.\n",
"    num_train_epochs=1,\n",
"    # Batching: 2 samples per device, gradients accumulated over 4 steps.\n",
"    per_device_train_batch_size=2,\n",
"    gradient_accumulation_steps=4,\n",
"    # Optimiser and learning-rate schedule.\n",
"    optim=\"adamw_8bit\",\n",
"    learning_rate=2e-4,\n",
"    weight_decay=0.01,\n",
"    lr_scheduler_type=\"linear\",\n",
"    warmup_steps=5,\n",
"    # Mixed precision: exactly one of fp16 / bf16 is enabled, chosen by the unsloth helper.\n",
"    fp16=not is_bfloat16_supported(),\n",
"    bf16=is_bfloat16_supported(),\n",
"    # Reproducibility, logging frequency and output location.\n",
"    seed=3407,\n",
"    logging_steps=1,\n",
"    output_dir=\"outputs\",\n",
")\n",
"\n",
"# `model`, `tokenizer`, `dataset` and `max_seq_length` come from earlier cells.\n",
"trainer = SFTTrainer(\n",
"    model=model,\n",
"    tokenizer=tokenizer,\n",
"    train_dataset=dataset,\n",
"    dataset_text_field=\"text\",  # dataset column used as the training text\n",
"    max_seq_length=max_seq_length,\n",
"    dataset_num_proc=2,\n",
"    packing=False,\n",
"    args=training_args,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5e5f1095-e0c0-484f-bba4-de3480b50419",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1\n",
" \\\\ /| Num examples = 2,572 | Num Epochs = 1\n",
"O^O/ \\_/ \\ Batch size per device = 2 | Gradient Accumulation steps = 4\n",
"\\ / Total batch size = 8 | Total steps = 321\n",
" \"-____-\" Number of trainable parameters = 40,370,176\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
"
\n",
" [321/321 38:52, Epoch 0/1]\n",
"
\n",
" \n",
" \n",
" \n",
" Step | \n",
" Training Loss | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1.788500 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.704700 | \n",
"
\n",
" \n",
" 3 | \n",
" 1.656400 | \n",
"
\n",
" \n",
" 4 | \n",
" 1.652300 | \n",
"
\n",
" \n",
" 5 | \n",
" 1.522300 | \n",
"
\n",
" \n",
" 6 | \n",
" 1.423100 | \n",
"
\n",
" \n",
" 7 | \n",
" 1.321000 | \n",
"
\n",
" \n",
" 8 | \n",
" 1.243200 | \n",
"
\n",
" \n",
" 9 | \n",
" 1.150800 | \n",
"
\n",
" \n",
" 10 | \n",
" 1.088500 | \n",
"
\n",
" \n",
" 11 | \n",
" 0.917600 | \n",
"
\n",
" \n",
" 12 | \n",
" 0.887900 | \n",
"
\n",
" \n",
" 13 | \n",
" 0.792400 | \n",
"
\n",
" \n",
" 14 | \n",
" 0.567600 | \n",
"
\n",
" \n",
" 15 | \n",
" 0.544000 | \n",
"
\n",
" \n",
" 16 | \n",
" 0.460500 | \n",
"
\n",
" \n",
" 17 | \n",
" 0.461500 | \n",
"
\n",
" \n",
" 18 | \n",
" 0.584600 | \n",
"
\n",
" \n",
" 19 | \n",
" 0.328500 | \n",
"
\n",
" \n",
" 20 | \n",
" 0.420900 | \n",
"
\n",
" \n",
" 21 | \n",
" 0.441000 | \n",
"
\n",
" \n",
" 22 | \n",
" 0.438100 | \n",
"
\n",
" \n",
" 23 | \n",
" 0.276400 | \n",
"
\n",
" \n",
" 24 | \n",
" 0.359400 | \n",
"
\n",
" \n",
" 25 | \n",
" 0.319300 | \n",
"
\n",
" \n",
" 26 | \n",
" 0.414800 | \n",
"
\n",
" \n",
" 27 | \n",
" 0.294700 | \n",
"
\n",
" \n",
" 28 | \n",
" 0.474200 | \n",
"
\n",
" \n",
" 29 | \n",
" 0.326000 | \n",
"
\n",
" \n",
" 30 | \n",
" 0.376400 | \n",
"
\n",
" \n",
" 31 | \n",
" 0.340200 | \n",
"
\n",
" \n",
" 32 | \n",
" 0.318000 | \n",
"
\n",
" \n",
" 33 | \n",
" 0.371900 | \n",
"
\n",
" \n",
" 34 | \n",
" 0.286000 | \n",
"
\n",
" \n",
" 35 | \n",
" 0.386600 | \n",
"
\n",
" \n",
" 36 | \n",
" 0.321400 | \n",
"
\n",
" \n",
" 37 | \n",
" 0.320800 | \n",
"
\n",
" \n",
" 38 | \n",
" 0.238400 | \n",
"
\n",
" \n",
" 39 | \n",
" 0.251800 | \n",
"
\n",
" \n",
" 40 | \n",
" 0.207500 | \n",
"
\n",
" \n",
" 41 | \n",
" 0.273500 | \n",
"
\n",
" \n",
" 42 | \n",
" 0.268000 | \n",
"
\n",
" \n",
" 43 | \n",
" 0.207900 | \n",
"
\n",
" \n",
" 44 | \n",
" 0.236400 | \n",
"
\n",
" \n",
" 45 | \n",
" 0.190500 | \n",
"
\n",
" \n",
" 46 | \n",
" 0.236000 | \n",
"
\n",
" \n",
" 47 | \n",
" 0.162100 | \n",
"
\n",
" \n",
" 48 | \n",
" 0.237900 | \n",
"
\n",
" \n",
" 49 | \n",
" 0.167600 | \n",
"
\n",
" \n",
" 50 | \n",
" 0.170900 | \n",
"
\n",
" \n",
" 51 | \n",
" 0.215200 | \n",
"
\n",
" \n",
" 52 | \n",
" 0.147000 | \n",
"
\n",
" \n",
" 53 | \n",
" 0.169300 | \n",
"
\n",
" \n",
" 54 | \n",
" 0.153400 | \n",
"
\n",
" \n",
" 55 | \n",
" 0.159200 | \n",
"
\n",
" \n",
" 56 | \n",
" 0.143600 | \n",
"
\n",
" \n",
" 57 | \n",
" 0.157200 | \n",
"
\n",
" \n",
" 58 | \n",
" 0.161700 | \n",
"
\n",
" \n",
" 59 | \n",
" 0.139200 | \n",
"
\n",
" \n",
" 60 | \n",
" 0.144800 | \n",
"
\n",
" \n",
" 61 | \n",
" 0.133000 | \n",
"
\n",
" \n",
" 62 | \n",
" 0.176000 | \n",
"
\n",
" \n",
" 63 | \n",
" 0.156500 | \n",
"
\n",
" \n",
" 64 | \n",
" 0.116100 | \n",
"
\n",
" \n",
" 65 | \n",
" 0.126500 | \n",
"
\n",
" \n",
" 66 | \n",
" 0.133000 | \n",
"
\n",
" \n",
" 67 | \n",
" 0.172600 | \n",
"
\n",
" \n",
" 68 | \n",
" 0.112600 | \n",
"
\n",
" \n",
" 69 | \n",
" 0.082300 | \n",
"
\n",
" \n",
" 70 | \n",
" 0.138400 | \n",
"
\n",
" \n",
" 71 | \n",
" 0.140700 | \n",
"
\n",
" \n",
" 72 | \n",
" 0.109400 | \n",
"
\n",
" \n",
" 73 | \n",
" 0.104000 | \n",
"
\n",
" \n",
" 74 | \n",
" 0.115100 | \n",
"
\n",
" \n",
" 75 | \n",
" 0.118900 | \n",
"
\n",
" \n",
" 76 | \n",
" 0.104100 | \n",
"
\n",
" \n",
" 77 | \n",
" 0.089600 | \n",
"
\n",
" \n",
" 78 | \n",
" 0.102300 | \n",
"
\n",
" \n",
" 79 | \n",
" 0.084700 | \n",
"
\n",
" \n",
" 80 | \n",
" 0.139100 | \n",
"
\n",
" \n",
" 81 | \n",
" 0.083000 | \n",
"
\n",
" \n",
" 82 | \n",
" 0.133300 | \n",
"
\n",
" \n",
" 83 | \n",
" 0.122800 | \n",
"
\n",
" \n",
" 84 | \n",
" 0.103400 | \n",
"
\n",
" \n",
" 85 | \n",
" 0.076700 | \n",
"
\n",
" \n",
" 86 | \n",
" 0.063800 | \n",
"
\n",
" \n",
" 87 | \n",
" 0.139000 | \n",
"
\n",
" \n",
" 88 | \n",
" 0.073300 | \n",
"
\n",
" \n",
" 89 | \n",
" 0.117800 | \n",
"
\n",
" \n",
" 90 | \n",
" 0.061400 | \n",
"
\n",
" \n",
" 91 | \n",
" 0.115300 | \n",
"
\n",
" \n",
" 92 | \n",
" 0.114000 | \n",
"
\n",
" \n",
" 93 | \n",
" 0.091000 | \n",
"
\n",
" \n",
" 94 | \n",
" 0.061000 | \n",
"
\n",
" \n",
" 95 | \n",
" 0.063000 | \n",
"
\n",
" \n",
" 96 | \n",
" 0.071300 | \n",
"
\n",
" \n",
" 97 | \n",
" 0.076700 | \n",
"
\n",
" \n",
" 98 | \n",
" 0.079000 | \n",
"
\n",
" \n",
" 99 | \n",
" 0.087500 | \n",
"
\n",
" \n",
" 100 | \n",
" 0.061000 | \n",
"
\n",
" \n",
" 101 | \n",
" 0.077000 | \n",
"
\n",
" \n",
" 102 | \n",
" 0.097900 | \n",
"
\n",
" \n",
" 103 | \n",
" 0.072200 | \n",
"
\n",
" \n",
" 104 | \n",
" 0.107800 | \n",
"
\n",
" \n",
" 105 | \n",
" 0.083100 | \n",
"
\n",
" \n",
" 106 | \n",
" 0.050400 | \n",
"
\n",
" \n",
" 107 | \n",
" 0.098600 | \n",
"
\n",
" \n",
" 108 | \n",
" 0.105700 | \n",
"
\n",
" \n",
" 109 | \n",
" 0.076400 | \n",
"
\n",
" \n",
" 110 | \n",
" 0.053600 | \n",
"
\n",
" \n",
" 111 | \n",
" 0.086500 | \n",
"
\n",
" \n",
" 112 | \n",
" 0.049800 | \n",
"
\n",
" \n",
" 113 | \n",
" 0.106800 | \n",
"
\n",
" \n",
" 114 | \n",
" 0.063800 | \n",
"
\n",
" \n",
" 115 | \n",
" 0.075500 | \n",
"
\n",
" \n",
" 116 | \n",
" 0.059300 | \n",
"
\n",
" \n",
" 117 | \n",
" 0.104200 | \n",
"
\n",
" \n",
" 118 | \n",
" 0.079300 | \n",
"
\n",
" \n",
" 119 | \n",
" 0.072400 | \n",
"
\n",
" \n",
" 120 | \n",
" 0.075000 | \n",
"
\n",
" \n",
" 121 | \n",
" 0.064000 | \n",
"
\n",
" \n",
" 122 | \n",
" 0.058900 | \n",
"
\n",
" \n",
" 123 | \n",
" 0.049700 | \n",
"
\n",
" \n",
" 124 | \n",
" 0.123200 | \n",
"
\n",
" \n",
" 125 | \n",
" 0.084100 | \n",
"
\n",
" \n",
" 126 | \n",
" 0.050400 | \n",
"
\n",
" \n",
" 127 | \n",
" 0.084200 | \n",
"
\n",
" \n",
" 128 | \n",
" 0.085200 | \n",
"
\n",
" \n",
" 129 | \n",
" 0.094800 | \n",
"
\n",
" \n",
" 130 | \n",
" 0.070500 | \n",
"
\n",
" \n",
" 131 | \n",
" 0.044100 | \n",
"
\n",
" \n",
" 132 | \n",
" 0.055200 | \n",
"
\n",
" \n",
" 133 | \n",
" 0.079600 | \n",
"
\n",
" \n",
" 134 | \n",
" 0.068100 | \n",
"
\n",
" \n",
" 135 | \n",
" 0.043400 | \n",
"
\n",
" \n",
" 136 | \n",
" 0.042700 | \n",
"
\n",
" \n",
" 137 | \n",
" 0.045900 | \n",
"
\n",
" \n",
" 138 | \n",
" 0.044200 | \n",
"
\n",
" \n",
" 139 | \n",
" 0.028800 | \n",
"
\n",
" \n",
" 140 | \n",
" 0.083500 | \n",
"
\n",
" \n",
" 141 | \n",
" 0.097000 | \n",
"
\n",
" \n",
" 142 | \n",
" 0.076600 | \n",
"
\n",
" \n",
" 143 | \n",
" 0.060900 | \n",
"
\n",
" \n",
" 144 | \n",
" 0.091200 | \n",
"
\n",
" \n",
" 145 | \n",
" 0.101800 | \n",
"
\n",
" \n",
" 146 | \n",
" 0.064100 | \n",
"
\n",
" \n",
" 147 | \n",
" 0.059300 | \n",
"
\n",
" \n",
" 148 | \n",
" 0.055800 | \n",
"
\n",
" \n",
" 149 | \n",
" 0.059800 | \n",
"
\n",
" \n",
" 150 | \n",
" 0.068300 | \n",
"
\n",
" \n",
" 151 | \n",
" 0.049300 | \n",
"
\n",
" \n",
" 152 | \n",
" 0.059400 | \n",
"
\n",
" \n",
" 153 | \n",
" 0.051600 | \n",
"
\n",
" \n",
" 154 | \n",
" 0.025700 | \n",
"
\n",
" \n",
" 155 | \n",
" 0.054900 | \n",
"
\n",
" \n",
" 156 | \n",
" 0.048400 | \n",
"
\n",
" \n",
" 157 | \n",
" 0.068600 | \n",
"
\n",
" \n",
" 158 | \n",
" 0.066500 | \n",
"
\n",
" \n",
" 159 | \n",
" 0.074800 | \n",
"
\n",
" \n",
" 160 | \n",
" 0.046100 | \n",
"
\n",
" \n",
" 161 | \n",
" 0.079600 | \n",
"
\n",
" \n",
" 162 | \n",
" 0.071600 | \n",
"
\n",
" \n",
" 163 | \n",
" 0.062200 | \n",
"
\n",
" \n",
" 164 | \n",
" 0.081800 | \n",
"
\n",
" \n",
" 165 | \n",
" 0.050500 | \n",
"
\n",
" \n",
" 166 | \n",
" 0.049800 | \n",
"
\n",
" \n",
" 167 | \n",
" 0.062800 | \n",
"
\n",
" \n",
" 168 | \n",
" 0.039000 | \n",
"
\n",
" \n",
" 169 | \n",
" 0.063800 | \n",
"
\n",
" \n",
" 170 | \n",
" 0.053100 | \n",
"
\n",
" \n",
" 171 | \n",
" 0.099100 | \n",
"
\n",
" \n",
" 172 | \n",
" 0.046800 | \n",
"
\n",
" \n",
" 173 | \n",
" 0.051000 | \n",
"
\n",
" \n",
" 174 | \n",
" 0.039900 | \n",
"
\n",
" \n",
" 175 | \n",
" 0.071700 | \n",
"
\n",
" \n",
" 176 | \n",
" 0.058300 | \n",
"
\n",
" \n",
" 177 | \n",
" 0.047000 | \n",
"
\n",
" \n",
" 178 | \n",
" 0.037900 | \n",
"
\n",
" \n",
" 179 | \n",
" 0.036300 | \n",
"
\n",
" \n",
" 180 | \n",
" 0.069000 | \n",
"
\n",
" \n",
" 181 | \n",
" 0.063400 | \n",
"
\n",
" \n",
" 182 | \n",
" 0.070700 | \n",
"
\n",
" \n",
" 183 | \n",
" 0.039900 | \n",
"
\n",
" \n",
" 184 | \n",
" 0.047500 | \n",
"
\n",
" \n",
" 185 | \n",
" 0.039100 | \n",
"
\n",
" \n",
" 186 | \n",
" 0.040700 | \n",
"
\n",
" \n",
" 187 | \n",
" 0.041100 | \n",
"
\n",
" \n",
" 188 | \n",
" 0.040800 | \n",
"
\n",
" \n",
" 189 | \n",
" 0.030300 | \n",
"
\n",
" \n",
" 190 | \n",
" 0.050300 | \n",
"
\n",
" \n",
" 191 | \n",
" 0.046000 | \n",
"
\n",
" \n",
" 192 | \n",
" 0.048800 | \n",
"
\n",
" \n",
" 193 | \n",
" 0.061800 | \n",
"
\n",
" \n",
" 194 | \n",
" 0.035900 | \n",
"
\n",
" \n",
" 195 | \n",
" 0.045500 | \n",
"
\n",
" \n",
" 196 | \n",
" 0.066200 | \n",
"
\n",
" \n",
" 197 | \n",
" 0.045200 | \n",
"
\n",
" \n",
" 198 | \n",
" 0.078800 | \n",
"
\n",
" \n",
" 199 | \n",
" 0.048200 | \n",
"
\n",
" \n",
" 200 | \n",
" 0.051000 | \n",
"
\n",
" \n",
" 201 | \n",
" 0.067500 | \n",
"
\n",
" \n",
" 202 | \n",
" 0.048600 | \n",
"
\n",
" \n",
" 203 | \n",
" 0.041000 | \n",
"
\n",
" \n",
" 204 | \n",
" 0.066300 | \n",
"
\n",
" \n",
" 205 | \n",
" 0.039200 | \n",
"
\n",
" \n",
" 206 | \n",
" 0.057100 | \n",
"
\n",
" \n",
" 207 | \n",
" 0.048000 | \n",
"
\n",
" \n",
" 208 | \n",
" 0.027000 | \n",
"
\n",
" \n",
" 209 | \n",
" 0.050800 | \n",
"
\n",
" \n",
" 210 | \n",
" 0.044900 | \n",
"
\n",
" \n",
" 211 | \n",
" 0.042800 | \n",
"
\n",
" \n",
" 212 | \n",
" 0.032800 | \n",
"
\n",
" \n",
" 213 | \n",
" 0.049300 | \n",
"
\n",
" \n",
" 214 | \n",
" 0.035000 | \n",
"
\n",
" \n",
" 215 | \n",
" 0.071400 | \n",
"
\n",
" \n",
" 216 | \n",
" 0.080100 | \n",
"
\n",
" \n",
" 217 | \n",
" 0.091400 | \n",
"
\n",
" \n",
" 218 | \n",
" 0.035700 | \n",
"
\n",
" \n",
" 219 | \n",
" 0.035700 | \n",
"
\n",
" \n",
" 220 | \n",
" 0.045200 | \n",
"
\n",
" \n",
" 221 | \n",
" 0.034100 | \n",
"
\n",
" \n",
" 222 | \n",
" 0.039000 | \n",
"
\n",
" \n",
" 223 | \n",
" 0.035000 | \n",
"
\n",
" \n",
" 224 | \n",
" 0.066000 | \n",
"
\n",
" \n",
" 225 | \n",
" 0.044600 | \n",
"
\n",
" \n",
" 226 | \n",
" 0.039100 | \n",
"
\n",
" \n",
" 227 | \n",
" 0.023700 | \n",
"
\n",
" \n",
" 228 | \n",
" 0.055200 | \n",
"
\n",
" \n",
" 229 | \n",
" 0.034500 | \n",
"
\n",
" \n",
" 230 | \n",
" 0.041800 | \n",
"
\n",
" \n",
" 231 | \n",
" 0.045400 | \n",
"
\n",
" \n",
" 232 | \n",
" 0.050800 | \n",
"
\n",
" \n",
" 233 | \n",
" 0.040600 | \n",
"
\n",
" \n",
" 234 | \n",
" 0.047800 | \n",
"
\n",
" \n",
" 235 | \n",
" 0.029800 | \n",
"
\n",
" \n",
" 236 | \n",
" 0.081300 | \n",
"
\n",
" \n",
" 237 | \n",
" 0.052800 | \n",
"
\n",
" \n",
" 238 | \n",
" 0.058700 | \n",
"
\n",
" \n",
" 239 | \n",
" 0.093300 | \n",
"
\n",
" \n",
" 240 | \n",
" 0.092700 | \n",
"
\n",
" \n",
" 241 | \n",
" 0.058200 | \n",
"
\n",
" \n",
" 242 | \n",
" 0.062700 | \n",
"
\n",
" \n",
" 243 | \n",
" 0.096400 | \n",
"
\n",
" \n",
" 244 | \n",
" 0.033400 | \n",
"
\n",
" \n",
" 245 | \n",
" 0.034700 | \n",
"
\n",
" \n",
" 246 | \n",
" 0.035800 | \n",
"
\n",
" \n",
" 247 | \n",
" 0.056900 | \n",
"
\n",
" \n",
" 248 | \n",
" 0.066100 | \n",
"
\n",
" \n",
" 249 | \n",
" 0.042600 | \n",
"
\n",
" \n",
" 250 | \n",
" 0.057200 | \n",
"
\n",
" \n",
" 251 | \n",
" 0.025500 | \n",
"
\n",
" \n",
" 252 | \n",
" 0.032900 | \n",
"
\n",
" \n",
" 253 | \n",
" 0.036500 | \n",
"
\n",
" \n",
" 254 | \n",
" 0.061700 | \n",
"
\n",
" \n",
" 255 | \n",
" 0.046000 | \n",
"
\n",
" \n",
" 256 | \n",
" 0.028400 | \n",
"
\n",
" \n",
" 257 | \n",
" 0.043100 | \n",
"
\n",
" \n",
" 258 | \n",
" 0.053200 | \n",
"
\n",
" \n",
" 259 | \n",
" 0.070800 | \n",
"
\n",
" \n",
" 260 | \n",
" 0.031700 | \n",
"
\n",
" \n",
" 261 | \n",
" 0.044800 | \n",
"
\n",
" \n",
" 262 | \n",
" 0.031000 | \n",
"
\n",
" \n",
" 263 | \n",
" 0.023300 | \n",
"
\n",
" \n",
" 264 | \n",
" 0.049600 | \n",
"
\n",
" \n",
" 265 | \n",
" 0.041400 | \n",
"
\n",
" \n",
" 266 | \n",
" 0.064400 | \n",
"
\n",
" \n",
" 267 | \n",
" 0.053600 | \n",
"
\n",
" \n",
" 268 | \n",
" 0.040900 | \n",
"
\n",
" \n",
" 269 | \n",
" 0.040200 | \n",
"
\n",
" \n",
" 270 | \n",
" 0.053600 | \n",
"
\n",
" \n",
" 271 | \n",
" 0.033500 | \n",
"
\n",
" \n",
" 272 | \n",
" 0.033700 | \n",
"
\n",
" \n",
" 273 | \n",
" 0.040900 | \n",
"
\n",
" \n",
" 274 | \n",
" 0.105100 | \n",
"
\n",
" \n",
" 275 | \n",
" 0.026000 | \n",
"
\n",
" \n",
" 276 | \n",
" 0.023300 | \n",
"
\n",
" \n",
" 277 | \n",
" 0.117400 | \n",
"
\n",
" \n",
" 278 | \n",
" 0.046900 | \n",
"
\n",
" \n",
" 279 | \n",
" 0.064900 | \n",
"
\n",
" \n",
" 280 | \n",
" 0.027700 | \n",
"
\n",
" \n",
" 281 | \n",
" 0.044800 | \n",
"
\n",
" \n",
" 282 | \n",
" 0.063300 | \n",
"
\n",
" \n",
" 283 | \n",
" 0.032900 | \n",
"
\n",
" \n",
" 284 | \n",
" 0.028300 | \n",
"
\n",
" \n",
" 285 | \n",
" 0.027000 | \n",
"
\n",
" \n",
" 286 | \n",
" 0.044200 | \n",
"
\n",
" \n",
" 287 | \n",
" 0.056000 | \n",
"
\n",
" \n",
" 288 | \n",
" 0.023900 | \n",
"
\n",
" \n",
" 289 | \n",
" 0.094100 | \n",
"
\n",
" \n",
" 290 | \n",
" 0.018000 | \n",
"
\n",
" \n",
" 291 | \n",
" 0.059200 | \n",
"
\n",
" \n",
" 292 | \n",
" 0.058400 | \n",
"
\n",
" \n",
" 293 | \n",
" 0.040400 | \n",
"
\n",
" \n",
" 294 | \n",
" 0.025600 | \n",
"
\n",
" \n",
" 295 | \n",
" 0.015600 | \n",
"
\n",
" \n",
" 296 | \n",
" 0.065200 | \n",
"
\n",
" \n",
" 297 | \n",
" 0.029900 | \n",
"
\n",
" \n",
" 298 | \n",
" 0.025600 | \n",
"
\n",
" \n",
" 299 | \n",
" 0.014300 | \n",
"
\n",
" \n",
" 300 | \n",
" 0.062300 | \n",
"
\n",
" \n",
" 301 | \n",
" 0.017900 | \n",
"
\n",
" \n",
" 302 | \n",
" 0.047400 | \n",
"
\n",
" \n",
" 303 | \n",
" 0.084800 | \n",
"
\n",
" \n",
" 304 | \n",
" 0.053100 | \n",
"
\n",
" \n",
" 305 | \n",
" 0.027800 | \n",
"
\n",
" \n",
" 306 | \n",
" 0.018400 | \n",
"
\n",
" \n",
" 307 | \n",
" 0.021600 | \n",
"
\n",
" \n",
" 308 | \n",
" 0.070900 | \n",
"
\n",
" \n",
" 309 | \n",
" 0.060900 | \n",
"
\n",
" \n",
" 310 | \n",
" 0.055100 | \n",
"
\n",
" \n",
" 311 | \n",
" 0.060300 | \n",
"
\n",
" \n",
" 312 | \n",
" 0.079800 | \n",
"
\n",
" \n",
" 313 | \n",
" 0.072400 | \n",
"
\n",
" \n",
" 314 | \n",
" 0.063500 | \n",
"
\n",
" \n",
" 315 | \n",
" 0.036100 | \n",
"
\n",
" \n",
" 316 | \n",
" 0.034600 | \n",
"
\n",
" \n",
" 317 | \n",
" 0.009800 | \n",
"
\n",
" \n",
" 318 | \n",
" 0.036400 | \n",
"
\n",
" \n",
" 319 | \n",
" 0.063600 | \n",
"
\n",
" \n",
" 320 | \n",
" 0.045800 | \n",
"
\n",
" \n",
" 321 | \n",
" 0.042600 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Run the fine-tuning loop; the result is kept because the following cell reads its `.metrics`.\n",
"trainer_stats = trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "bf6a4048-6147-4f9d-ada5-cca176e82566",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2344.9026 seconds used for training.\n",
"39.08 minutes used for training.\n",
"Peak reserved memory = 10.463 GB.\n"
]
}
],
"source": [
"# Peak memory reserved on the CUDA device, scaled down by 1024 three times and rounded to 3 decimals.\n",
"peak_reserved_bytes = torch.cuda.max_memory_reserved()\n",
"used_memory = round(peak_reserved_bytes / 1024 / 1024 / 1024, 3)\n",
"\n",
"# Training duration in seconds, as reported in the metrics of the finished run.\n",
"train_runtime = trainer_stats.metrics['train_runtime']\n",
"\n",
"print(f\"{train_runtime} seconds used for training.\")\n",
"print(f\"{round(train_runtime/60, 2)} minutes used for training.\")\n",
"print(f\"Peak reserved memory = {used_memory} GB.\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c44c9b1f-c196-49aa-8bc4-f06216235503",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('Qwen2.5-7B-Instruct-hse_fine_tuned/tokenizer_config.json',\n",
" 'Qwen2.5-7B-Instruct-hse_fine_tuned/special_tokens_map.json',\n",
" 'Qwen2.5-7B-Instruct-hse_fine_tuned/vocab.json',\n",
" 'Qwen2.5-7B-Instruct-hse_fine_tuned/merges.txt',\n",
" 'Qwen2.5-7B-Instruct-hse_fine_tuned/added_tokens.json',\n",
" 'Qwen2.5-7B-Instruct-hse_fine_tuned/tokenizer.json')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The model and its tokenizer are written side by side into one directory.\n",
"save_dir = \"Qwen2.5-7B-Instruct-hse_fine_tuned\"\n",
"model.save_pretrained(save_dir)\n",
"# Left as the final expression so the tuple of written tokenizer files is shown as the cell result.\n",
"tokenizer.save_pretrained(save_dir)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "56cea5cc-c73f-4b27-9f3b-93367d0936dd",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"make: Entering directory '/home/ozaharov/hse_hackathon/llama.cpp'\n",
"I ccache not found. Consider installing it for faster compilation.\n",
"I llama.cpp build info: \n",
"I UNAME_S: Linux\n",
"I UNAME_P: x86_64\n",
"I UNAME_M: x86_64\n",
"I CFLAGS: -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion \n",
"I CXXFLAGS: -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE \n",
"I NVCCFLAGS: -std=c++11 -O3 -g \n",
"I LDFLAGS: \n",
"I CC: cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\n",
"I CXX: c++ (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\n",
"\n",
"rm -vrf *.dot libllava.a llama-baby-llama llama-batched llama-batched-bench llama-bench llama-cli llama-convert-llama2c-to-ggml llama-embedding llama-eval-callback llama-export-lora llama-gbnf-validator llama-gguf llama-gguf-hash llama-gguf-split llama-gritlm llama-imatrix llama-infill llama-llava-cli llama-minicpmv-cli llama-lookahead llama-lookup llama-lookup-create llama-lookup-merge llama-lookup-stats llama-parallel llama-passkey llama-perplexity llama-q8dot llama-quantize llama-quantize-stats llama-retrieval llama-save-load-state llama-server llama-simple llama-speculative llama-tokenize llama-vdot llama-cvector-generator llama-gen-docs tests/test-c.o tests/test-arg-parser tests/test-autorelease tests/test-backend-ops tests/test-chat-template tests/test-double-float tests/test-grad0 tests/test-grammar-integration tests/test-grammar-parser tests/test-json-schema-to-grammar tests/test-llama-grammar tests/test-log tests/test-model-load-cancel tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-rope tests/test-sampling tests/test-tokenizer-0 tests/test-tokenizer-1-bpe tests/test-tokenizer-1-spm\n",
"rm -rvf src/*.o\n",
"rm -rvf tests/*.o\n",
"rm -rvf examples/*.o\n",
"rm -rvf common/*.o\n",
"rm -rvf *.a\n",
"rm -rvf *.dll\n",
"rm -rvf *.so\n",
"rm -rvf *.dot\n",
"rm -rvf ggml/*.a\n",
"rm -rvf ggml/*.dll\n",
"rm -rvf ggml/*.so\n",
"rm -vrf ggml/src/*.o\n",
"rm -rvf ggml/src/llamafile/*.o\n",
"rm -rvf common/build-info.cpp\n",
"rm -vrf ggml/src/ggml-metal-embed.metal\n",
"rm -vrf ggml/src/ggml-cuda/*.o\n",
"rm -vrf ggml/src/ggml-cuda/template-instances/*.o\n",
"rm -rvf libllava.a llama-baby-llama llama-batched llama-batched-bench llama-bench llama-cli llama-convert-llama2c-to-ggml llama-embedding llama-eval-callback llama-export-lora llama-gbnf-validator llama-gguf llama-gguf-hash llama-gguf-split llama-gritlm llama-imatrix llama-infill llama-llava-cli llama-minicpmv-cli llama-lookahead llama-lookup llama-lookup-create llama-lookup-merge llama-lookup-stats llama-parallel llama-passkey llama-perplexity llama-q8dot llama-quantize llama-quantize-stats llama-retrieval llama-save-load-state llama-server llama-simple llama-speculative llama-tokenize llama-vdot llama-cvector-generator llama-gen-docs tests/test-c.o\n",
"rm -rvf tests/test-arg-parser tests/test-autorelease tests/test-backend-ops tests/test-chat-template tests/test-double-float tests/test-grad0 tests/test-grammar-integration tests/test-grammar-parser tests/test-json-schema-to-grammar tests/test-llama-grammar tests/test-log tests/test-model-load-cancel tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-rope tests/test-sampling tests/test-tokenizer-0 tests/test-tokenizer-1-bpe tests/test-tokenizer-1-spm\n",
"rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp\n",
"rm -rvf main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm\n",
"find examples pocs -type f -name \"*.o\" -delete\n",
"make: Leaving directory '/home/ozaharov/hse_hackathon/llama.cpp'\n",
"Unsloth: Merging 4bit and LoRA weights to 16bit...\n",
"Unsloth: Will use up to 305.42 out of 376.58 RAM for saving.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 28/28 [00:02<00:00, 11.87it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unsloth: Saving tokenizer... Done.\n",
"Unsloth: Saving model... This might take 5 minutes for Llama-7b...\n",
"Done.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Unsloth: Converting qwen2 model. Can use fast conversion = False.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"==((====))== Unsloth: Conversion from QLoRA to GGUF information\n",
" \\\\ /| [0] Installing llama.cpp will take 3 minutes.\n",
"O^O/ \\_/ \\ [1] Converting HF to GGUF 16bits will take 3 minutes.\n",
"\\ / [2] Converting GGUF 16bits to ['f16'] will take 10 minutes each.\n",
" \"-____-\" In total, you will have to wait at least 16 minutes.\n",
"\n",
"Unsloth: [0] Installing llama.cpp. This will take 3 minutes...\n",
"Unsloth: [1] Converting model at Qwen2.5-7B-Instruct-hse_fine_tuned into f16 GGUF format.\n",
"The output location will be /home/ozaharov/hse_hackathon/Qwen2.5-7B-Instruct-hse_fine_tuned/unsloth.F16.gguf\n",
"This will take 3 minutes...\n",
"INFO:hf-to-gguf:Loading model: Qwen2.5-7B-Instruct-hse_fine_tuned\n",
"INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only\n",
"INFO:hf-to-gguf:Exporting model...\n",
"INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'\n",
"INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'\n",
"INFO:hf-to-gguf:token_embd.weight, torch.float16 --> F16, shape = {3584, 152064}\n",
"INFO:hf-to-gguf:blk.0.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.0.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.0.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.0.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.0.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.0.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.0.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.0.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.0.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.0.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.0.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.0.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.1.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.1.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.1.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.1.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.1.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.1.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.1.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.1.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.1.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.1.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.1.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.1.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.2.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.2.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.2.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.2.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.2.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.2.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.2.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.2.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.2.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.2.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.2.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.2.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.3.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.3.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.3.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.3.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.3.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.3.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.3.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.3.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.3.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.3.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.3.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.3.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.4.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.4.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.4.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.4.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.4.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.4.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.4.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.4.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.4.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.4.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.4.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.4.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.5.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.5.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.5.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.5.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.5.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.5.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.5.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.5.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.5.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.5.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.5.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.5.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.6.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.6.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.6.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.6.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.6.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.6.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.6.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.6.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.6.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.6.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.6.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.6.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.7.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.7.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.7.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.7.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.7.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.7.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.7.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.7.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.7.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.7.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.7.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.7.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.8.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.8.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.8.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.8.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.8.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.8.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.8.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:gguf: loading model part 'model-00002-of-00004.safetensors'\n",
"INFO:hf-to-gguf:blk.10.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.10.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.10.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.10.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.10.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.10.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.10.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.10.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.10.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.10.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.10.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.10.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.11.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.11.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.11.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.11.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.11.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.11.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.11.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.11.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.11.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.11.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.11.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.11.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.12.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.12.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.12.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.12.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.12.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.12.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.12.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.12.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.12.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.12.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.12.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.12.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.13.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.13.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.13.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.13.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.13.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.13.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.13.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.13.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.13.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.13.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.13.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.13.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.14.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.14.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.14.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.14.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.14.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.14.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.14.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.14.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.14.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.14.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.14.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.14.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.15.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.15.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.15.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.15.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.15.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.15.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.15.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.15.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.15.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.15.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.15.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.15.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.16.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.16.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.16.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.16.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.16.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.16.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.16.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.16.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.16.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.16.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.16.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.16.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.17.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.17.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.17.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.17.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.17.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.17.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.17.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.17.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.17.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.17.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.17.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.17.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.18.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.18.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.18.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.18.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.18.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.18.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.18.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.18.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.18.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.8.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.8.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.8.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.8.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.8.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.9.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.9.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.9.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.9.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.9.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.9.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.9.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.9.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.9.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.9.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.9.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.9.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:gguf: loading model part 'model-00003-of-00004.safetensors'\n",
"INFO:hf-to-gguf:blk.18.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.18.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.18.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.19.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.19.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.19.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.19.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.19.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.19.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.19.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.19.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.19.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.19.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.19.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.19.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.20.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.20.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.20.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.20.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.20.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.20.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.20.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.20.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.20.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.20.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.20.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.20.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.21.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.21.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.21.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.21.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.21.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.21.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.21.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.21.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.21.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.21.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.21.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.21.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.22.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.22.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.22.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.22.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.22.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.22.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.22.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.22.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.22.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.22.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.22.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.22.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.23.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.23.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.23.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.23.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.23.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.23.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.23.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.23.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.23.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.23.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.23.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.23.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.24.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.24.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.24.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.24.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.24.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.24.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.24.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.24.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.24.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.24.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.24.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.24.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.25.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.25.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.25.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.25.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.25.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.25.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.25.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.25.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.25.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.25.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.25.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.25.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.26.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.26.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.26.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.26.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.26.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.26.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.26.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.26.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.26.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.26.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.26.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.26.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.27.attn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.27.ffn_down.weight, torch.float16 --> F16, shape = {18944, 3584}\n",
"INFO:hf-to-gguf:blk.27.ffn_gate.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.27.ffn_up.weight, torch.float16 --> F16, shape = {3584, 18944}\n",
"INFO:hf-to-gguf:blk.27.ffn_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.27.attn_k.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.27.attn_k.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:blk.27.attn_output.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.27.attn_q.bias, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:blk.27.attn_q.weight, torch.float16 --> F16, shape = {3584, 3584}\n",
"INFO:hf-to-gguf:blk.27.attn_v.bias, torch.float16 --> F32, shape = {512}\n",
"INFO:hf-to-gguf:blk.27.attn_v.weight, torch.float16 --> F16, shape = {3584, 512}\n",
"INFO:hf-to-gguf:output_norm.weight, torch.float16 --> F32, shape = {3584}\n",
"INFO:hf-to-gguf:gguf: loading model part 'model-00004-of-00004.safetensors'\n",
"INFO:hf-to-gguf:output.weight, torch.float16 --> F16, shape = {3584, 152064}\n",
"INFO:hf-to-gguf:Set meta model\n",
"INFO:hf-to-gguf:Set model parameters\n",
"INFO:hf-to-gguf:gguf: context length = 32768\n",
"INFO:hf-to-gguf:gguf: embedding length = 3584\n",
"INFO:hf-to-gguf:gguf: feed forward length = 18944\n",
"INFO:hf-to-gguf:gguf: head count = 28\n",
"INFO:hf-to-gguf:gguf: key-value head count = 4\n",
"INFO:hf-to-gguf:gguf: rope theta = 1000000.0\n",
"INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-06\n",
"INFO:hf-to-gguf:gguf: file type = 1\n",
"INFO:hf-to-gguf:Set model tokenizer\n",
"INFO:gguf.vocab:Adding 151387 merge(s).\n",
"INFO:gguf.vocab:Setting special token type eos to 151645\n",
"INFO:gguf.vocab:Setting special token type pad to 151665\n",
"INFO:gguf.vocab:Setting special token type bos to 151643\n",
"INFO:gguf.vocab:Setting add_bos_token to False\n",
"INFO:gguf.vocab:Setting chat_template to {%- if tools %}\n",
" {{- '<|im_start|>system\\n' }}\n",
" {%- if messages[0]['role'] == 'system' %}\n",
" {{- messages[0]['content'] }}\n",
" {%- else %}\n",
" {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n",
" {%- endif %}\n",
" {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n",
" {%- for tool in tools %}\n",
" {{- \"\\n\" }}\n",
" {{- tool | tojson }}\n",
" {%- endfor %}\n",
" {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n",
"{%- else %}\n",
" {%- if messages[0]['role'] == 'system' %}\n",
" {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n",
" {%- else %}\n",
" {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n",
" {%- endif %}\n",
"{%- endif %}\n",
"{%- for message in messages %}\n",
" {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n",
" {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n",
" {%- elif message.role == \"assistant\" %}\n",
" {{- '<|im_start|>' + message.role }}\n",
" {%- if message.content %}\n",
" {{- '\\n' + message.content }}\n",
" {%- endif %}\n",
" {%- for tool_call in message.tool_calls %}\n",
" {%- if tool_call.function is defined %}\n",
" {%- set tool_call = tool_call.function %}\n",
" {%- endif %}\n",
" {{- '\\n\\n{\"name\": \"' }}\n",
" {{- tool_call.name }}\n",
" {{- '\", \"arguments\": ' }}\n",
" {{- tool_call.arguments | tojson }}\n",
" {{- '}\\n' }}\n",
" {%- endfor %}\n",
" {{- '<|im_end|>\\n' }}\n",
" {%- elif message.role == \"tool\" %}\n",
" {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n",
" {{- '<|im_start|>user' }}\n",
" {%- endif %}\n",
" {{- '\\n\\n' }}\n",
" {{- message.content }}\n",
" {{- '\\n' }}\n",
" {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n",
" {{- '<|im_end|>\\n' }}\n",
" {%- endif %}\n",
" {%- endif %}\n",
"{%- endfor %}\n",
"{%- if add_generation_prompt %}\n",
" {{- '<|im_start|>assistant\\n' }}\n",
"{%- endif %}\n",
"\n",
"INFO:hf-to-gguf:Set model quantization version\n",
"INFO:gguf.gguf_writer:Writing the following files:\n",
"INFO:gguf.gguf_writer:/home/ozaharov/hse_hackathon/Qwen2.5-7B-Instruct-hse_fine_tuned/unsloth.F16.gguf: n_tensors = 339, total_size = 15.2G\n",
"Writing: 100%|██████████| 15.2G/15.2G [00:11<00:00, 1.35Gbyte/s]\n",
"INFO:hf-to-gguf:Model successfully exported to /home/ozaharov/hse_hackathon/Qwen2.5-7B-Instruct-hse_fine_tuned/unsloth.F16.gguf\n",
"Unsloth: Conversion completed! Output location: /home/ozaharov/hse_hackathon/Qwen2.5-7B-Instruct-hse_fine_tuned/unsloth.F16.gguf\n"
]
}
],
"source": [
"model.save_pretrained_gguf(\"Qwen2.5-7B-Instruct-hse_fine_tuned\", tokenizer, quantization_method=\"not_quantized\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8244803d-19e8-4187-977c-4b1c35dec999",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:.conda-unsloth]",
"language": "python",
"name": "conda-env-.conda-unsloth-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}