🐦 Twitter Post Details

@rasbt

Just read through the Gemma 3 report and toyed around with the models a bit, and there are a bunch of interesting tidbits: 1. Vocab size. They again use a very large vocab: 262k token (in contrast, Llama 3 has ~1/2 the vocab size), which should make the model more friendly for… https://t.co/EiCOIw3IyJ

🔧 Raw API Response

{
  "tweet": {
    "bookmark_count": 221,
    "bookmarked": false,
    "created_at": "Thu Mar 13 15:55:28 +0000 2025",
    "conversation_id_str": "1900214135847039316",
    "display_text_range": [
      0,
      279
    ],
    "entities": {
      "hashtags": [],
      "media": [
        {
          "display_url": "pic.x.com/EiCOIw3IyJ",
          "expanded_url": "https://x.com/rasbt/status/1900214135847039316/photo/1",
          "id_str": "1900213898080358400",
          "indices": [
            280,
            303
          ],
          "media_key": "3_1900213898080358400",
          "media_url_https": "https://pbs.twimg.com/media/Gl7qePhXEAAT_Qv.jpg",
          "type": "photo",
          "url": "https://t.co/EiCOIw3IyJ",
          "ext_media_availability": {
            "status": "Available"
          },
          "features": {
            "large": {
              "faces": []
            },
            "medium": {
              "faces": []
            },
            "small": {
              "faces": []
            },
            "orig": {
              "faces": []
            }
          },
          "sizes": {
            "large": {
              "h": 2048,
              "w": 1537,
              "resize": "fit"
            },
            "medium": {
              "h": 1200,
              "w": 901,
              "resize": "fit"
            },
            "small": {
              "h": 680,
              "w": 510,
              "resize": "fit"
            },
            "thumb": {
              "h": 150,
              "w": 150,
              "resize": "crop"
            }
          },
          "original_info": {
            "height": 2694,
            "width": 2022,
            "focus_rects": [
              {
                "x": 0,
                "y": 578,
                "w": 2022,
                "h": 1132
              },
              {
                "x": 0,
                "y": 133,
                "w": 2022,
                "h": 2022
              },
              {
                "x": 0,
                "y": 0,
                "w": 2022,
                "h": 2305
              },
              {
                "x": 0,
                "y": 0,
                "w": 1347,
                "h": 2694
              },
              {
                "x": 0,
                "y": 0,
                "w": 2022,
                "h": 2694
              }
            ]
          },
          "allow_download_status": {
            "allow_download": true
          },
          "media_results": {
            "result": {
              "media_key": "3_1900213898080358400"
            }
          }
        }
      ],
      "symbols": [],
      "timestamps": [],
      "urls": [],
      "user_mentions": []
    },
    "extended_entities": {
      "media": [
        {
          "display_url": "pic.x.com/EiCOIw3IyJ",
          "expanded_url": "https://x.com/rasbt/status/1900214135847039316/photo/1",
          "id_str": "1900213898080358400",
          "indices": [
            280,
            303
          ],
          "media_key": "3_1900213898080358400",
          "media_url_https": "https://pbs.twimg.com/media/Gl7qePhXEAAT_Qv.jpg",
          "type": "photo",
          "url": "https://t.co/EiCOIw3IyJ",
          "ext_media_availability": {
            "status": "Available"
          },
          "features": {
            "large": {
              "faces": []
            },
            "medium": {
              "faces": []
            },
            "small": {
              "faces": []
            },
            "orig": {
              "faces": []
            }
          },
          "sizes": {
            "large": {
              "h": 2048,
              "w": 1537,
              "resize": "fit"
            },
            "medium": {
              "h": 1200,
              "w": 901,
              "resize": "fit"
            },
            "small": {
              "h": 680,
              "w": 510,
              "resize": "fit"
            },
            "thumb": {
              "h": 150,
              "w": 150,
              "resize": "crop"
            }
          },
          "original_info": {
            "height": 2694,
            "width": 2022,
            "focus_rects": [
              {
                "x": 0,
                "y": 578,
                "w": 2022,
                "h": 1132
              },
              {
                "x": 0,
                "y": 133,
                "w": 2022,
                "h": 2022
              },
              {
                "x": 0,
                "y": 0,
                "w": 2022,
                "h": 2305
              },
              {
                "x": 0,
                "y": 0,
                "w": 1347,
                "h": 2694
              },
              {
                "x": 0,
                "y": 0,
                "w": 2022,
                "h": 2694
              }
            ]
          },
          "allow_download_status": {
            "allow_download": true
          },
          "media_results": {
            "result": {
              "media_key": "3_1900213898080358400"
            }
          }
        }
      ]
    },
    "favorite_count": 455,
    "favorited": false,
    "full_text": "Just read through the Gemma 3 report and toyed around with the models a bit, and there are a bunch of interesting tidbits:\n\n1. Vocab size. They again use a very large vocab: 262k token (in contrast, Llama 3 has ~1/2 the vocab size), which should make the model more friendly for… https://t.co/EiCOIw3IyJ",
    "is_quote_status": false,
    "lang": "en",
    "possibly_sensitive": false,
    "possibly_sensitive_editable": true,
    "quote_count": 5,
    "reply_count": 9,
    "retweet_count": 71,
    "retweeted": false,
    "user_id_str": "865622395",
    "id_str": "1900214135847039316",
    "note_tweet": {
      "is_expandable": true,
      "note_tweet_results": {
        "result": {
          "id": "Tm90ZVR3ZWV0OjE5MDAyMTQxMzU2MzMwODg1MTI=",
          "text": "Just read through the Gemma 3 report and toyed around with the models a bit, and there are a bunch of interesting tidbits:\n\n1. Vocab size. They again use a very large vocab: 262k token (in contrast, Llama 3 has ~1/2 the vocab size), which should make the model more friendly for further pre-training on non-English texts.\n\n2. Context window. They finally support larger contexts, up to 128k like Llama 3 (up from 8k in Gemma 2).\n\n3. Multimodal support. They added multimodal support (but the implementation is a topic for another time)\n\n4. Training set size. The training set size is 14T tokens for their 27B model, which is in a similar ballpark as Llama 3 (15T) and DeepSeek v3 (14.8 T). What's interesting is that the smaller models were trained on fewer tokens: 12T for the 12B version, 4T for the 4B version, and 2T for the 1B version. They probably adhered to their flavor of scaling laws here (but interestingly, in contrast, Llama 3.2 1B to 405B was trained on the same number of tokens as far as I know, as the Llama team didn't observe any saturation, and \"more was better.\")\n\n5. Distillation. They use (a flavor of) proper distillation where they don't just train on synthetic data generated by the teacher model but also consider (a subset of) the logits of the teacher model.\n\n6. Post-training. For post-training, in addition to RL with human preferences, they also added the DeepSeek-R1-like verifiable rewards for code execution and math problems.\n\n7. Memory optimization. They use both grouped-query attention and sliding window attention to reduce memory requirements to compensate for the large vocab (similar to Gemma 2) but also now the larger context sizes.\n\nBut where Gemma 2 used sliding window attention in every other layer, they now have a 5:1 ratio, meaning there's only 1 full attention layer for every 5 sliding windows (local) attention layers; moreover, the sliding window size was reduced from 4096 (Gemma 2) to just 1028 (Gemma 3).\n\nAccording to the ablation studies, this reduction to save memory has almost no impact on the model performance.\n\nI tried out Gemma 3 1B vs Gemma 2 2B vs LLama 3.2 1B for different context lengths, and Gemma 3 (1B) seems to be the most memory-friendly model now if you stay below 40k input tokens.",
          "entity_set": {
            "hashtags": [],
            "symbols": [],
            "urls": [],
            "user_mentions": []
          },
          "richtext": {
            "richtext_tags": []
          },
          "media": {
            "inline_media": []
          }
        }
      }
    }
  },
  "user": {
    "__typename": "User",
    "id": "VXNlcjo4NjU2MjIzOTU=",
    "rest_id": "865622395",
    "affiliates_highlighted_label": {},
    "has_graduated_access": true,
    "is_blue_verified": true,
    "profile_image_shape": "Circle",
    "legacy": {
      "can_dm": false,
      "can_media_tag": true,
      "created_at": "Sun Oct 07 02:06:16 +0000 2012",
      "default_profile": false,
      "default_profile_image": false,
      "description": "ML/AI researcher & former stats professor turned LLM research engineer. Author of \"Build a Large Language Model From Scratch\" (https://t.co/O8LAAMRzzW).",
      "entities": {
        "description": {
          "urls": [
            {
              "display_url": "amzn.to/4fqvn0D",
              "expanded_url": "https://amzn.to/4fqvn0D",
              "url": "https://t.co/O8LAAMRzzW",
              "indices": [
                127,
                150
              ]
            }
          ]
        },
        "url": {
          "urls": [
            {
              "display_url": "sebastianraschka.com",
              "expanded_url": "https://sebastianraschka.com",
              "url": "https://t.co/HrtQQ5tgJl",
              "indices": [
                0,
                23
              ]
            }
          ]
        }
      },
      "fast_followers_count": 0,
      "favourites_count": 21591,
      "followers_count": 337070,
      "friends_count": 1013,
      "has_custom_timelines": true,
      "is_translator": false,
      "listed_count": 4153,
      "location": "",
      "media_count": 1917,
      "name": "Sebastian Raschka",
      "normal_followers_count": 337070,
      "pinned_tweet_ids_str": [
        "1913589458726690892"
      ],
      "possibly_sensitive": false,
      "profile_banner_url": "https://pbs.twimg.com/profile_banners/865622395/1742309979",
      "profile_image_url_https": "https://pbs.twimg.com/profile_images/1661187442043486209/a3E4t1eV_normal.jpg",
      "profile_interstitial_type": "",
      "screen_name": "rasbt",
      "statuses_count": 17742,
      "translator_type": "none",
      "url": "https://t.co/HrtQQ5tgJl",
      "verified": false,
      "want_retweets": false,
      "withheld_in_countries": []
    },
    "professional": {
      "rest_id": "1487642811856007168",
      "professional_type": "Creator",
      "category": [
        {
          "id": 713,
          "name": "Science & Technology",
          "icon_name": ""
        }
      ]
    },
    "tipjar_settings": {
      "is_enabled": true,
      "bitcoin_handle": "",
      "cash_app_handle": "SebastianRaschka",
      "ethereum_handle": "",
      "venmo_handle": "Sebastian-Raschka"
    }
  },
  "views": "88714"
}
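
🧪 Illustrative Code Sketches

Several of the numbered points in the note tweet above are concrete enough to sketch in code. None of the snippets below are taken from the Gemma 3 report or codebase; they are minimal Python illustrations of the ideas, with every assumed name, model id, and hyperparameter flagged in the comments.

Point 1 compares vocab sizes (~262k for Gemma vs. roughly half for Llama 3). One quick way to check this, assuming the Hugging Face model ids below (both repos are gated, so their licenses must be accepted first):

from transformers import AutoTokenizer

# Model ids are assumptions; both repos are gated on the Hugging Face Hub.
for model_id in ["google/gemma-3-1b-it", "meta-llama/Llama-3.2-1B"]:
    tok = AutoTokenizer.from_pretrained(model_id)
    print(model_id, "vocab size:", len(tok))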
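
Point 4 lists training-set sizes per model size (14T/27B, 12T/12B, 4T/4B, 2T/1B). The scaling behavior is easiest to see as tokens per parameter, computed purely from the numbers quoted in the tweet:

# Tokens-per-parameter ratios from the sizes quoted in point 4.
runs = {"27B": (27e9, 14e12), "12B": (12e9, 12e12),
        "4B": (4e9, 4e12), "1B": (1e9, 2e12)}
for name, (params, tokens) in runs.items():
    print(f"{name}: {tokens / params:,.0f} tokens per parameter")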
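
Point 5 describes distillation that trains on (a subset of) the teacher's logits rather than only on sampled synthetic text. A minimal PyTorch sketch of that idea; the temperature, the top-k truncation, and the loss form are generic distillation choices, not the report's exact recipe:

import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, temperature=1.0, k=128):
    """KL distillation on a top-k subset of the teacher's logits.

    Assumed shapes: (batch, seq_len, vocab_size). Top-k truncation is one
    common way to avoid materializing a full 262k-entry teacher distribution.
    """
    topk_vals, topk_idx = teacher_logits.topk(k, dim=-1)
    student_subset = student_logits.gather(-1, topk_idx)

    teacher_probs = F.softmax(topk_vals / temperature, dim=-1)
    student_logprobs = F.log_softmax(student_subset / temperature, dim=-1)

    # KL(teacher || student), scaled by T^2 as in standard distillation.
    return F.kl_div(student_logprobs, teacher_probs,
                    reduction="batchmean") * temperature ** 2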
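
Point 7 describes a 5:1 ratio of sliding-window (local) to full (global) attention layers. A sketch of the layer pattern and the corresponding causal masks; the tweet quotes a window of 1028, and the 1024 used below (the power-of-two usually cited for Gemma 3) should be treated as an assumption:

import torch

def is_global_layer(layer_idx, ratio=5):
    # 5 local layers for every 1 global layer -> every 6th layer is global.
    return (layer_idx + 1) % (ratio + 1) == 0

def causal_mask(seq_len, window=None):
    # Boolean mask: True where a query position may attend to a key position.
    i = torch.arange(seq_len).unsqueeze(1)  # query positions
    j = torch.arange(seq_len).unsqueeze(0)  # key positions
    mask = j <= i                           # causal constraint
    if window is not None:
        mask &= (i - j) < window            # sliding-window constraint
    return mask

# Layers 0-4 get the windowed mask, layer 5 the full causal mask, and so on.
for layer_idx in range(6):
    w = None if is_global_layer(layer_idx) else 1024
    print(layer_idx, "global" if w is None else f"local (window={w})")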
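
The memory claim at the end of point 7 (and the 40k-token observation in the last paragraph) follows from the KV cache: local layers only ever cache `window` positions, so their cost stops growing with context length. A back-of-envelope estimate; the layer count, KV-head count, and head dim below are hypothetical placeholders, not Gemma 3's published config:

def kv_cache_bytes(context_len, n_layers=26, n_kv_heads=1, head_dim=256,
                   window=1024, global_every=6, bytes_per_elem=2):
    # bytes_per_elem=2 assumes bf16/fp16 K and V tensors.
    total = 0
    for layer in range(n_layers):
        is_global = (layer + 1) % global_every == 0
        # Global layers cache the full context; local layers cap at the window.
        cached = context_len if is_global else min(context_len, window)
        total += 2 * cached * n_kv_heads * head_dim * bytes_per_elem  # K and V
    return total

ctx = 40_000
all_global = kv_cache_bytes(ctx, window=ctx, global_every=1)
mixed_5_to_1 = kv_cache_bytes(ctx)
print(f"all-global: {all_global / 2**20:.0f} MiB, "
      f"5:1 mixed: {mixed_5_to_1 / 2**20:.0f} MiB")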