mediainbox
/

MiMo-VL-7B-RL-2508-gptq-q4

 library_name: transformers
 ---
+Quantized with a GPTQModel 4.0.0 dev build, using the following code:
+<details>
+  <summary>quantization code</summary>
+  ```python
+  import base64
+  from io import BytesIO
+  from random import seed, shuffle
+  from datasets import concatenate_datasets, load_dataset
+  from gptqmodel import GPTQModel, QuantizeConfig
+  from transformers import AutoTokenizer
+  # Seed Python's `random` module so the `shuffle(...)` calls in this script
+  # produce the same ordering on every run.
+  seed(0)
+  # Source checkpoint: passed to both the tokenizer loader and GPTQModel.load.
+  MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL-2508"
+  # Output directory handed to model.save() once quantization finishes.
+  SAVE_DIR = "MiMo-VL-7B-RL-2508-gptq-q4"
+  # Number of raw texts kept after shuffling (before per-text filtering).
+  NUM_TEXT_SAMPLES = 128
+  # Number of image rows turned into image+text conversations.
+  NUM_IMAGE_SAMPLES = 128
+  # Per-text cap, in token ids, applied when building text conversations.
+  MAX_TOKENS = 1024
+  def encode_pil_to_data_uri(pil_image) -> str:
+      """Return the image as a base64 data URI string.
+
+      The image is written in PNG format to an in-memory buffer, the bytes are
+      base64-encoded, and the result is prefixed with ``data:image;base64,``.
+
+      Args:
+          pil_image: Object exposing ``save(fileobj, format=...)``; the name
+              suggests a PIL image.
+
+      Returns:
+          A string of the form ``data:image;base64,<payload>``.
+      """
+      buff = BytesIO()
+      pil_image.save(buff, format="PNG")
+      encoded = base64.b64encode(buff.getvalue()).decode("utf-8")
+      # NOTE(review): the prefix carries the bare media type `image` (no
+      # `/png` subtype) although the payload is PNG. Presumably the downstream
+      # image loader only looks for the `data:image` prefix - confirm before
+      # changing it.
+      return f"data:image;base64,{encoded}"
+  def make_text_conversations(texts, tok, max_tokens=1024):
+      """Build one single-turn, text-only user conversation per usable text.
+
+      An input is skipped when it is not a ``str``, when it is empty after
+      stripping whitespace, or when it encodes to no token ids.
+
+      Args:
+          texts: Iterable of candidate texts.
+          tok: Tokenizer-like object providing
+              ``encode(text, add_special_tokens=False)`` and
+              ``decode(ids, skip_special_tokens=True)``.
+          max_tokens: Maximum number of token ids kept from each text.
+
+      Returns:
+          A list of conversations. Each conversation is a list holding one
+          ``{"role": "user", "content": [{"type": "text", "text": ...}]}``
+          message. The list may be shorter than ``texts`` because of skips.
+      """
+      convs = []
+      for t in texts:
+          if not isinstance(t, str):
+              continue
+          tt = t.strip()
+          if not tt:
+              continue
+          # Truncate in token space: keep only the first `max_tokens` ids.
+          ids = tok.encode(tt, add_special_tokens=False)[:max_tokens]
+          if not ids:
+              continue
+          # Decode the truncated ids back to a string, because the
+          # conversation stores text rather than token ids.
+          trunc = tok.decode(ids, skip_special_tokens=True)
+          convs.append(
+              [
+                  {
+                      "role": "user",
+                      "content": [{"type": "text", "text": trunc}],
+                  }
+              ]
+          )
+      return convs
+  def make_image_conversations(hf_dataset, num_samples=64):
+      """Build one single-turn image+text user conversation per dataset row.
+
+      Rows are taken from the start of ``hf_dataset`` in its current order;
+      at most ``num_samples`` rows are used, fewer if the dataset is smaller.
+
+      Args:
+          hf_dataset: Object supporting ``len()`` and ``select(indices)`` whose
+              rows expose an ``"image"`` entry accepted by
+              ``encode_pil_to_data_uri``.
+          num_samples: Upper bound on the number of rows converted. The
+              default is 64; the script below passes NUM_IMAGE_SAMPLES.
+
+      Returns:
+          A list of conversations. Each conversation is a list holding one
+          user message whose content is an image item (data URI) followed by
+          the fixed text prompt ``"What does the image show?"``.
+      """
+      convs = []
+      # Clamp the requested count to the dataset size before selecting.
+      for ex in hf_dataset.select(range(min(num_samples, len(hf_dataset)))):
+          data_uri = encode_pil_to_data_uri(ex["image"])
+          convs.append(
+              [
+                  {
+                      "role": "user",
+                      "content": [
+                          {"type": "image", "image": data_uri},
+                          {"type": "text", "text": "What does the image show?"},
+                      ],
+                  }
+              ]
+          )
+      return convs
+  # --- Text calibration data -------------------------------------------------
+  # Load a single English and a single Spanish shard of allenai/c4 and shuffle
+  # each with a fixed dataset-level seed.
+  en_ds = load_dataset(
+      "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
+  ).shuffle(seed=0)
+  es_ds = load_dataset(
+      "allenai/c4", data_files="multilingual/c4-es.tfrecord-00001-of-02048.json.gz", split="train"
+  ).shuffle(seed=0)
+  # Materialise the "text" field of every row of both shards into one list.
+  texts = [x["text"] for x in concatenate_datasets([en_ds, es_ds])]
+  # Drop non-string and whitespace-only entries before sampling.
+  texts = [t for t in texts if isinstance(t, str) and t.strip()]
+  # Mix the two languages using the `random` module RNG seeded by seed(0),
+  # then keep the first NUM_TEXT_SAMPLES texts.
+  shuffle(texts)
+  texts = texts[:NUM_TEXT_SAMPLES]
+  tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
+  # make_text_conversations can skip entries, so this list may end up shorter
+  # than NUM_TEXT_SAMPLES; the print below reports the actual count.
+  text_conversations = make_text_conversations(texts, tok, max_tokens=MAX_TOKENS)
+  # --- Image calibration data ------------------------------------------------
+  # Take the first 512 rows of the flickr30k test split, shuffle them with
+  # seed 42, and convert the first NUM_IMAGE_SAMPLES rows of that order.
+  img_ds = load_dataset("lmms-lab/flickr30k", split="test[:512]").shuffle(seed=42)
+  image_conversations = make_image_conversations(img_ds, num_samples=NUM_IMAGE_SAMPLES)
+  # --- Combined calibration set ----------------------------------------------
+  # Interleave text-only and image+text conversations. This is the second
+  # draw from the RNG seeded by seed(0), so it must stay after shuffle(texts)
+  # to reproduce the same ordering.
+  calibration_conversations = text_conversations + image_conversations
+  shuffle(calibration_conversations)
+  print(
+      f"Prepared {len(text_conversations)} text-only and "
+      f"{len(image_conversations)} image+text conversations "
+      f"(total {len(calibration_conversations)})."
+  )
+  # --- Quantization ----------------------------------------------------------
+  # 4-bit configuration with group size 128, targeting the first CUDA device.
+  qconf = QuantizeConfig(
+      bits=4,
+      group_size=128,
+      device="cuda:0",
+      v2=False,  # v2 is giving much worse results
+  )
+  model = GPTQModel.load(MODEL_ID, qconf)
+  # Calibrate on the mixed conversations one sample at a time.
+  model.quantize(
+      calibration_conversations,
+      batch_size=1,
+  )
+  # Write the quantized model to SAVE_DIR.
+  model.save(SAVE_DIR)
+  print(f"Saved quantized model to: {SAVE_DIR}")
+  ```
+</details>