---
base_model:
- XiaomiMiMo/MiMo-VL-7B-RL-2508
library_name: transformers
---

Quantized with GPTQModel 4.0.0 (dev) using the following code:
```python
import base64
from io import BytesIO
from random import seed, shuffle

from datasets import concatenate_datasets, load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

seed(0)

MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL-2508"
SAVE_DIR = "MiMo-VL-7B-RL-2508-gptq-q4"
NUM_TEXT_SAMPLES = 128
NUM_IMAGE_SAMPLES = 128
MAX_TOKENS = 1024


def encode_pil_to_data_uri(pil_image) -> str:
    """Encode a PIL image as a base64 PNG data URI for use in chat messages."""
    buff = BytesIO()
    pil_image.save(buff, format="PNG")
    encoded = base64.b64encode(buff.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{encoded}"


def make_text_conversations(texts, tok, max_tokens=1024):
    """Build text-only calibration conversations, truncating each sample to max_tokens."""
    convs = []
    for t in texts:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if not tt:
            continue
        ids = tok.encode(tt, add_special_tokens=False)[:max_tokens]
        if not ids:
            continue
        trunc = tok.decode(ids, skip_special_tokens=True)
        convs.append(
            [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": trunc}],
                }
            ]
        )
    return convs


def make_image_conversations(hf_dataset, num_samples=64):
    """Build image+text calibration conversations from a dataset with an "image" column."""
    convs = []
    for ex in hf_dataset.select(range(min(num_samples, len(hf_dataset)))):
        data_uri = encode_pil_to_data_uri(ex["image"])
        convs.append(
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": data_uri},
                        {"type": "text", "text": "What does the image show?"},
                    ],
                }
            ]
        )
    return convs


# Text calibration data: one English and one Spanish C4 shard.
en_ds = load_dataset(
    "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
).shuffle(seed=0)
es_ds = load_dataset(
    "allenai/c4",
    data_files="multilingual/c4-es.tfrecord-00001-of-02048.json.gz",
    split="train",
).shuffle(seed=0)
texts = [x["text"] for x in concatenate_datasets([en_ds, es_ds])]
texts = [t for t in texts if isinstance(t, str) and t.strip()]
shuffle(texts)
texts = texts[:NUM_TEXT_SAMPLES]

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
text_conversations = make_text_conversations(texts, tok, max_tokens=MAX_TOKENS)

# Image calibration data: Flickr30k test images paired with a generic question.
img_ds = load_dataset("lmms-lab/flickr30k", split="test[:512]").shuffle(seed=42)
image_conversations = make_image_conversations(img_ds, num_samples=NUM_IMAGE_SAMPLES)

calibration_conversations = text_conversations + image_conversations
shuffle(calibration_conversations)

print(
    f"Prepared {len(text_conversations)} text-only and "
    f"{len(image_conversations)} image+text conversations "
    f"(total {len(calibration_conversations)})."
)

qconf = QuantizeConfig(
    bits=4,
    group_size=128,
    device="cuda:0",
    v2=False,  # v2 is giving much worse results
)

model = GPTQModel.load(MODEL_ID, qconf)
model.quantize(
    calibration_conversations,
    batch_size=1,
)
model.save(SAVE_DIR)
print(f"Saved quantized model to: {SAVE_DIR}")
```
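
For loading the quantized checkpoint, a minimal text-only smoke test could look like the sketch below. The device string, prompt, and generation settings are illustrative assumptions rather than part of the quantization script; for chat- or image-formatted inputs you would apply the model's chat template / processor instead of raw tokenization.

```python
from gptqmodel import GPTQModel
from transformers import AutoTokenizer

SAVE_DIR = "MiMo-VL-7B-RL-2508-gptq-q4"      # directory written by model.save() above
MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL-2508"   # tokenizer loaded from the original repo

# Load the quantized weights; assumes a CUDA device is available,
# matching device="cuda:0" in the quantization script.
model = GPTQModel.load(SAVE_DIR)
tok = AutoTokenizer.from_pretrained(MODEL_ID)

prompt = "Describe GPTQ quantization in one sentence."  # illustrative prompt
input_ids = tok(prompt, return_tensors="pt").input_ids.to("cuda:0")

# generate() on the GPTQModel wrapper is forwarded to the underlying transformers model.
out = model.generate(input_ids, max_new_tokens=64)
print(tok.decode(out[0], skip_special_tokens=True))
```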