🐦 Twitter Post Details

Viewing enriched Twitter post

@PyTorch

NCCL watchdog timeouts are some of the most commonly misunderstood and difficult-to-debug errors in AI training. In the past, PyTorch released Flight Recorder to make debugging these errors easier, but interpreting its outputs has been tricky. Based on an analysis of countless NCCL watchdog timeouts at Meta, the PyTorch team at Meta has put together a guide on how to effectively use Flight Recorder to debug NCCL watchdog timeouts, and to better understand why they occur (spoiler: it's usually not because of network or NCCL issues). If you've ever spent hours debugging these errors and are looking for a better solution, this blog post is for you: https://t.co/HIBdkYgpv8 ✍ Yifei Liu, Uttam Thakore, Ph.D., Junjie W., Yongzhong Yang #PyTorch #OpenSourceAI #NCCL #FlightRecorder

Media 1

📊 Media Metadata

{
  "media": [
    {
      "type": "photo",
      "url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/2037253905818456306/media_0.jpg",
      "filename": "media_0.jpg"
    }
  ],
  "processed_at": "2026-03-26T19:51:53.770925",
  "pipeline_version": "2.0"
}

🔧 Raw API Response

{
  "type": "tweet",
  "id": "2037253905818456306",
  "url": "https://x.com/PyTorch/status/2037253905818456306",
  "twitterUrl": "https://twitter.com/PyTorch/status/2037253905818456306",
  "text": "NCCL watchdog timeouts are some of the most commonly misunderstood and difficult-to-debug errors in AI training. In the past, PyTorch released Flight Recorder to make debugging these errors easier, but interpreting its outputs has been tricky.\n\nBased on an analysis of countless NCCL watchdog timeouts at Meta, the PyTorch team at Meta has put together a guide on how to effectively use Flight Recorder to debug NCCL watchdog timeouts, and to better understand why they occur (spoiler: it's usually not because of network or NCCL issues).\n\nIf you've ever spent hours debugging these errors and are looking for a better solution, this blog post is for you: https://t.co/HIBdkYgpv8\n\n✍ Yifei Liu, Uttam Thakore, Ph.D., Junjie W., Yongzhong Yang\n\n#PyTorch #OpenSourceAI #NCCL #FlightRecorder",
  "source": "Twitter for iPhone",
  "retweetCount": 2,
  "replyCount": 0,
  "likeCount": 3,
  "quoteCount": 0,
  "viewCount": 416,
  "createdAt": "Thu Mar 26 19:42:36 +0000 2026",
  "lang": "en",
  "bookmarkCount": 2,
  "isReply": false,
  "inReplyToId": null,
  "conversationId": "2037253905818456306",
  "displayTextRange": [
    0,
    278
  ],
  "inReplyToUserId": null,
  "inReplyToUsername": null,
  "author": {
    "type": "user",
    "userName": "PyTorch",
    "url": "https://x.com/PyTorch",
    "twitterUrl": "https://twitter.com/PyTorch",
    "id": "776585502606721024",
    "name": "PyTorch",
    "isVerified": false,
    "isBlueVerified": true,
    "verifiedType": null,
    "profilePicture": "https://pbs.twimg.com/profile_images/1813965160702451712/yXV1vRhr_normal.jpg",
    "coverPicture": "https://pbs.twimg.com/profile_banners/776585502606721024/1761575044",
    "description": "",
    "location": "",
    "followers": 482248,
    "following": 83,
    "status": "",
    "canDm": false,
    "canMediaTag": true,
    "createdAt": "Fri Sep 16 00:56:26 +0000 2016",
    "entities": {
      "description": {
        "urls": []
      },
      "url": {}
    },
    "fastFollowersCount": 0,
    "favouritesCount": 860,
    "hasCustomTimelines": true,
    "isTranslator": false,
    "mediaCount": 1367,
    "statusesCount": 3121,
    "withheldInCountries": [],
    "affiliatesHighlightedLabel": {},
    "possiblySensitive": false,
    "pinnedTweetIds": [],
    "profile_bio": {
      "description": "Tensors and neural networks in Python with strong hardware acceleration. PyTorch is an open source project at the Linux Foundation. #PyTorchFoundation",
      "entities": {
        "description": {
          "hashtags": [
            {
              "indices": [
                132,
                150
              ],
              "text": "PyTorchFoundation"
            }
          ],
          "symbols": [],
          "urls": [],
          "user_mentions": []
        },
        "url": {
          "urls": [
            {
              "display_url": "pytorch.org",
              "expanded_url": "http://pytorch.org",
              "indices": [
                0,
                23
              ],
              "url": "https://t.co/6SwTBhUwTJ"
            }
          ]
        }
      }
    },
    "isAutomated": false,
    "automatedBy": null
  },
  "extendedEntities": {
    "media": [
      {
        "display_url": "pic.twitter.com/iXFS5yNXay",
        "expanded_url": "https://twitter.com/PyTorch/status/2037253905818456306/photo/1",
        "ext_media_availability": {
          "status": "Available"
        },
        "features": {
          "large": {
            "faces": []
          },
          "orig": {
            "faces": []
          }
        },
        "id_str": "2037253884746330112",
        "indices": [
          279,
          302
        ],
        "media_key": "3_2037253884746330112",
        "media_results": {
          "id": "QXBpTWVkaWFSZXN1bHRzOgwAAQoAARxFx51FlvAACgACHEXHoi2WEPIAAA==",
          "result": {
            "__typename": "ApiMedia",
            "id": "QXBpTWVkaWE6DAABCgABHEXHnUWW8AAKAAIcRceiLZYQ8gAA",
            "media_key": "3_2037253884746330112"
          }
        },
        "media_url_https": "https://pbs.twimg.com/media/HEXHnUWW8AAoMrs.jpg",
        "original_info": {
          "focus_rects": [
            {
              "h": 573,
              "w": 1024,
              "x": 0,
              "y": 0
            },
            {
              "h": 576,
              "w": 576,
              "x": 198,
              "y": 0
            },
            {
              "h": 576,
              "w": 505,
              "x": 234,
              "y": 0
            },
            {
              "h": 576,
              "w": 288,
              "x": 342,
              "y": 0
            },
            {
              "h": 576,
              "w": 1024,
              "x": 0,
              "y": 0
            }
          ],
          "height": 576,
          "width": 1024
        },
        "sizes": {
          "large": {
            "h": 576,
            "w": 1024
          }
        },
        "type": "photo",
        "url": "https://t.co/iXFS5yNXay"
      }
    ]
  },
  "card": null,
  "place": {},
  "entities": {
    "hashtags": [
      {
        "indices": [
          743,
          751
        ],
        "text": "PyTorch"
      },
      {
        "indices": [
          752,
          765
        ],
        "text": "OpenSourceAI"
      },
      {
        "indices": [
          766,
          771
        ],
        "text": "NCCL"
      },
      {
        "indices": [
          772,
          787
        ],
        "text": "FlightRecorder"
      }
    ],
    "symbols": [],
    "urls": [
      {
        "display_url": "pytorch.org/blog/flight-re…",
        "expanded_url": "https://pytorch.org/blog/flight-recorder-a-new-lens-for-understanding-nccl-watchdog-timeouts/",
        "indices": [
          656,
          679
        ],
        "url": "https://t.co/HIBdkYgpv8"
      }
    ],
    "user_mentions": []
  },
  "quoted_tweet": null,
  "retweeted_tweet": null,
  "isLimitedReply": false,
  "article": null
}