Human-1 / hindi_moshi_architecture.svg
bhaskarbuilds's picture
Upload hindi_moshi_architecture.svg
bd59300 verified
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 720 420" font-family="Arial, Helvetica, sans-serif" role="img" aria-labelledby="hindi-moshi-title hindi-moshi-desc">
  <!-- Accessible name and long description, built only from the labels drawn below. -->
  <title id="hindi-moshi-title">Hindi Moshi architecture</title>
  <desc id="hindi-moshi-desc">User audio enters a frozen Mimi encoder, whose audio tokens feed the Temporal Transformer (7B language model). The Temporal Transformer drives a reinitialised Text Linear layer that outputs Hindi text, and passes z_s to the Depth Transformer, which produces audio tokens. Those tokens return to the frozen Mimi decoder, which produces Moshi audio. A new Hindi SentencePiece tokeniser feeds the reinitialised text embeddings. Self-attention layers and audio embeddings are retained.</desc>

  <defs>
    <!--
      Arrowheads. An SVG 1.1 marker does not inherit the stroke colour of the
      line that references it, so there is one marker per connector colour.
      All share the same geometry; the tip (x=8) sits on the line's end point.
    -->
    <marker id="arr" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#555"/>
    </marker>
    <marker id="arr-purple" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#8E44AD"/>
    </marker>
    <marker id="arr-red" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#E74C3C"/>
    </marker>
    <marker id="arr-green" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#27AE60"/>
    </marker>
    <marker id="arr-blue" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#5DADE2"/>
    </marker>
  </defs>

  <!-- Opaque white canvas covering the whole viewBox. -->
  <rect width="720" height="420" fill="white"/>

  <!-- Legend: colour key for the component boxes (swatch + label pairs). -->
  <rect x="20" y="12" width="14" height="14" rx="2" fill="#4A90D9"/>
  <text x="40" y="24" font-size="11" fill="#333">Frozen</text>
  <rect x="110" y="12" width="14" height="14" rx="2" fill="#7F8C8D"/>
  <text x="130" y="24" font-size="11" fill="#333">Retained</text>
  <rect x="220" y="12" width="14" height="14" rx="2" fill="#E74C3C"/>
  <text x="240" y="24" font-size="11" fill="#333">Reinitialised</text>
  <rect x="350" y="12" width="14" height="14" rx="2" fill="#27AE60"/>
  <text x="370" y="24" font-size="11" fill="#333">New (Hindi)</text>

  <!-- Left column: audio input (top) and audio output (bottom). -->
  <rect x="20" y="130" width="90" height="36" rx="5" fill="#F5F5F5" stroke="#BBB"/>
  <text x="65" y="153" text-anchor="middle" font-size="11" fill="#333">User Audio</text>
  <rect x="20" y="250" width="90" height="36" rx="5" fill="#F5F5F5" stroke="#BBB"/>
  <text x="65" y="273" text-anchor="middle" font-size="11" fill="#333">Moshi Audio</text>

  <!-- Mimi encoder and decoder, both drawn in the "Frozen" legend colour. -->
  <rect x="140" y="120" width="80" height="56" rx="5" fill="#4A90D9"/>
  <text x="180" y="146" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Mimi</text>
  <text x="180" y="162" text-anchor="middle" font-size="9" fill="#D6EAF8">Encoder</text>
  <rect x="140" y="240" width="80" height="56" rx="5" fill="#4A90D9"/>
  <text x="180" y="266" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Mimi</text>
  <text x="180" y="282" text-anchor="middle" font-size="9" fill="#D6EAF8">Decoder</text>
  <text x="180" y="315" text-anchor="middle" font-size="8" fill="#4A90D9" font-style="italic">frozen</text>

  <!--
    Connectors between the audio boxes and Mimi. Convention used by every
    straight connector in this diagram: start on the source box edge and stop
    2px short of the target box so the arrowhead does not overlap its border.
  -->
  <line x1="110" y1="148" x2="138" y2="148" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>
  <line x1="140" y1="268" x2="112" y2="268" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>

  <!-- Label for the encoder output stream. -->
  <text x="237" y="142" font-size="9" fill="#888">Audio tokens</text>

  <!-- Connector: Mimi Encoder → Temporal Transformer. -->
  <line x1="220" y1="148" x2="290" y2="148" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>

  <!-- Temporal Transformer container with its three sub-layers. -->
  <rect x="292" y="55" width="200" height="175" rx="6" fill="#FAFAFA" stroke="#AAA" stroke-width="1.2"/>
  <text x="392" y="75" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">Temporal Transformer</text>
  <text x="392" y="90" text-anchor="middle" font-size="9" fill="#999">7B Language Model</text>
  <rect x="305" y="100" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="119" text-anchor="middle" font-size="10" fill="white">Self-Attention Layers</text>
  <rect x="305" y="135" width="174" height="28" rx="4" fill="#E74C3C"/>
  <text x="392" y="154" text-anchor="middle" font-size="10" fill="white">Text Embeddings ✦</text>
  <rect x="305" y="170" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="189" text-anchor="middle" font-size="10" fill="white">Audio Embeddings</text>

  <!-- z_s badge; it straddles the bottom border of the Temporal Transformer box. -->
  <rect x="362" y="210" width="60" height="22" rx="3" fill="#F0E6FF" stroke="#8E44AD"/>
  <text x="392" y="225" text-anchor="middle" font-size="10" font-weight="bold" fill="#8E44AD">z_s</text>

  <!-- Depth Transformer container with its three sub-layers. -->
  <rect x="292" y="250" width="200" height="145" rx="6" fill="#FAFAFA" stroke="#AAA" stroke-width="1.2"/>
  <text x="392" y="270" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">Depth Transformer</text>
  <rect x="305" y="280" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="299" text-anchor="middle" font-size="10" fill="white">Causal Self-Attention</text>
  <rect x="305" y="315" width="174" height="28" rx="4" fill="#E74C3C"/>
  <text x="392" y="334" text-anchor="middle" font-size="10" fill="white">Text Embeddings ✦</text>
  <rect x="305" y="350" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="369" text-anchor="middle" font-size="10" fill="white">Audio Embeddings</text>

  <!-- Connector: z_s → Depth Transformer (purple shaft, purple head). -->
  <line x1="392" y1="232" x2="392" y2="248" stroke="#8E44AD" stroke-width="1.2" marker-end="url(#arr-purple)"/>

  <!-- Text Linear head, drawn in the "Reinitialised" legend colour. -->
  <rect x="520" y="100" width="110" height="38" rx="5" fill="#E74C3C"/>
  <text x="575" y="124" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Text Linear ✦</text>

  <!-- Connector: Temporal Transformer → Text Linear (red shaft, red head). -->
  <line x1="492" y1="119" x2="518" y2="119" stroke="#E74C3C" stroke-width="1.2" marker-end="url(#arr-red)"/>

  <!-- Hindi text output box and the connector from Text Linear down to it. -->
  <rect x="530" y="150" width="90" height="26" rx="4" fill="#FDEDEC" stroke="#E74C3C"/>
  <text x="575" y="168" text-anchor="middle" font-size="9.5" fill="#C0392B">Hindi Text</text>
  <line x1="575" y1="138" x2="575" y2="148" stroke="#E74C3C" stroke-width="1" marker-end="url(#arr-red)"/>

  <!-- Hindi tokeniser, drawn in the "New (Hindi)" legend colour. -->
  <rect x="520" y="195" width="160" height="40" rx="5" fill="#27AE60"/>
  <text x="600" y="214" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Hindi SentencePiece ★</text>
  <text x="600" y="228" text-anchor="middle" font-size="9" fill="#D5F5E3">Hindi vocabulary</text>

  <!-- Dashed connector: tokeniser → Temporal Transformer text embeddings (green head). -->
  <path d="M 520 215 Q 500 180 480 154" fill="none" stroke="#27AE60" stroke-width="1.2" stroke-dasharray="4,3" marker-end="url(#arr-green)"/>

  <!-- Audio token output of the Depth Transformer. -->
  <rect x="520" y="310" width="110" height="38" rx="5" fill="#EBF5FB" stroke="#5DADE2"/>
  <text x="575" y="334" text-anchor="middle" font-size="10" font-weight="bold" fill="#333">Audio Tokens</text>

  <!-- Connector: Depth Transformer → Audio Tokens. -->
  <line x1="492" y1="329" x2="518" y2="329" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>

  <!--
    Dashed connector: Audio Tokens → Mimi Decoder (blue head). Per the original
    author's note, the 16 audio tokens are split: 8 Moshi tokens go to the Mimi
    Decoder along this path, 8 User tokens are kept for the next step.
  -->
  <path d="M 520 340 Q 400 380 222 275" fill="none" stroke="#5DADE2" stroke-width="1.2" stroke-dasharray="4,3" marker-end="url(#arr-blue)"/>

  <!-- Footnotes explaining the ✦ and ★ symbols used in the box labels. -->
  <text x="20" y="400" font-size="9" fill="#C0392B">✦ Reinitialised for Hindi</text>
  <text x="20" y="414" font-size="9" fill="#27AE60">★ New component</text>
</svg>