File size: 6,581 Bytes
bd59300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
<svg xmlns="http://www.w3.org/2000/svg"
     viewBox="0 0 720 420"
     font-family="Arial, Helvetica, sans-serif"
     role="img"
     aria-labelledby="diagram-title diagram-desc">

  <!-- Accessible name and long description for assistive technology.
       The wording only restates what the boxes and arrows below show. -->
  <title id="diagram-title">Moshi architecture: components frozen, retained, reinitialised or added for Hindi</title>
  <desc id="diagram-desc">User audio enters the Mimi encoder, whose audio tokens feed the Temporal Transformer. The Temporal Transformer passes z_s to the Depth Transformer and feeds a Text Linear layer that outputs Hindi text. A Hindi SentencePiece tokeniser feeds the text embeddings. The Depth Transformer outputs audio tokens, which return to the Mimi decoder to produce Moshi audio. Colour coding: Mimi is frozen; attention layers and audio embeddings are retained; text embeddings and the Text Linear layer are reinitialised; the Hindi SentencePiece tokeniser is new.</desc>

  <defs>
    <!-- One arrowhead per connector colour. A marker is painted with its own
         fill, not with the stroke of the line that references it, so a single
         grey marker would leave coloured connectors with grey heads.
         All heads share the same geometry; the tip (refX=8) sits on the end
         point of the referencing line. -->
    <marker id="arr" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#555"/>
    </marker>
    <marker id="arr-red" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#E74C3C"/>
    </marker>
    <marker id="arr-purple" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#8E44AD"/>
    </marker>
    <marker id="arr-green" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#27AE60"/>
    </marker>
    <marker id="arr-blue" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
      <path d="M0,0 L8,3 L0,6 Z" fill="#5DADE2"/>
    </marker>
  </defs>

  <!-- Canvas background: covers the full viewBox -->
  <rect width="720" height="420" fill="white"/>

  <!-- Legend: colour swatch followed by its label, left to right -->
  <rect x="20" y="12" width="14" height="14" rx="2" fill="#4A90D9"/>
  <text x="40" y="24" font-size="11" fill="#333">Frozen</text>
  <rect x="110" y="12" width="14" height="14" rx="2" fill="#7F8C8D"/>
  <text x="130" y="24" font-size="11" fill="#333">Retained</text>
  <rect x="220" y="12" width="14" height="14" rx="2" fill="#E74C3C"/>
  <text x="240" y="24" font-size="11" fill="#333">Reinitialised</text>
  <rect x="350" y="12" width="14" height="14" rx="2" fill="#27AE60"/>
  <text x="370" y="24" font-size="11" fill="#333">New (Hindi)</text>

  <!-- Left column: audio input (top) and audio output (bottom) -->
  <rect x="20" y="130" width="90" height="36" rx="5" fill="#F5F5F5" stroke="#BBB"/>
  <text x="65" y="153" text-anchor="middle" font-size="11" fill="#333">User Audio</text>

  <rect x="20" y="250" width="90" height="36" rx="5" fill="#F5F5F5" stroke="#BBB"/>
  <text x="65" y="273" text-anchor="middle" font-size="11" fill="#333">Moshi Audio</text>

  <!-- Mimi encoder (top) and decoder (bottom), both in the "Frozen" colour -->
  <rect x="140" y="120" width="80" height="56" rx="5" fill="#4A90D9"/>
  <text x="180" y="146" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Mimi</text>
  <text x="180" y="162" text-anchor="middle" font-size="9" fill="#D6EAF8">Encoder</text>

  <rect x="140" y="240" width="80" height="56" rx="5" fill="#4A90D9"/>
  <text x="180" y="266" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Mimi</text>
  <text x="180" y="282" text-anchor="middle" font-size="9" fill="#D6EAF8">Decoder</text>

  <text x="180" y="315" text-anchor="middle" font-size="8" fill="#4A90D9" font-style="italic">frozen</text>

  <!-- User Audio → Mimi Encoder, and Mimi Decoder → Moshi Audio.
       Lines stop 2 units short of the target box so the arrow tip stays visible. -->
  <line x1="110" y1="148" x2="138" y2="148" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>
  <line x1="138" y1="268" x2="110" y2="268" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>

  <!-- Label above the encoder output arrow -->
  <text x="237" y="142" font-size="9" fill="#888">Audio tokens</text>

  <!-- Mimi Encoder → Temporal Transformer -->
  <line x1="220" y1="148" x2="290" y2="148" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>

  <!-- Temporal Transformer: outer frame, heading, then three stacked rows -->
  <rect x="292" y="55" width="200" height="175" rx="6" fill="#FAFAFA" stroke="#AAA" stroke-width="1.2"/>
  <text x="392" y="75" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">Temporal Transformer</text>
  <text x="392" y="90" text-anchor="middle" font-size="9" fill="#999">7B Language Model</text>

  <rect x="305" y="100" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="119" text-anchor="middle" font-size="10" fill="white">Self-Attention Layers</text>

  <rect x="305" y="135" width="174" height="28" rx="4" fill="#E74C3C"/>
  <text x="392" y="154" text-anchor="middle" font-size="10" fill="white">Text Embeddings ✦</text>

  <rect x="305" y="170" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="189" text-anchor="middle" font-size="10" fill="white">Audio Embeddings</text>

  <!-- z_s badge, drawn across the bottom border of the Temporal Transformer frame -->
  <rect x="362" y="210" width="60" height="22" rx="3" fill="#F0E6FF" stroke="#8E44AD"/>
  <text x="392" y="225" text-anchor="middle" font-size="10" font-weight="bold" fill="#8E44AD">z_s</text>

  <!-- Depth Transformer: outer frame, heading, then three stacked rows -->
  <rect x="292" y="250" width="200" height="145" rx="6" fill="#FAFAFA" stroke="#AAA" stroke-width="1.2"/>
  <text x="392" y="270" text-anchor="middle" font-size="12" font-weight="bold" fill="#333">Depth Transformer</text>

  <rect x="305" y="280" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="299" text-anchor="middle" font-size="10" fill="white">Causal Self-Attention</text>

  <rect x="305" y="315" width="174" height="28" rx="4" fill="#E74C3C"/>
  <text x="392" y="334" text-anchor="middle" font-size="10" fill="white">Text Embeddings ✦</text>

  <rect x="305" y="350" width="174" height="28" rx="4" fill="#7F8C8D"/>
  <text x="392" y="369" text-anchor="middle" font-size="10" fill="white">Audio Embeddings</text>

  <!-- z_s → Depth Transformer (purple line, purple head) -->
  <line x1="392" y1="232" x2="392" y2="248" stroke="#8E44AD" stroke-width="1.2" marker-end="url(#arr-purple)"/>

  <!-- Text Linear layer, in the "Reinitialised" colour -->
  <rect x="520" y="100" width="110" height="38" rx="5" fill="#E74C3C"/>
  <text x="575" y="124" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Text Linear ✦</text>

  <!-- Temporal Transformer → Text Linear (red line, red head) -->
  <line x1="492" y1="119" x2="518" y2="119" stroke="#E74C3C" stroke-width="1.2" marker-end="url(#arr-red)"/>

  <!-- Hindi text output box below Text Linear -->
  <rect x="530" y="150" width="90" height="26" rx="4" fill="#FDEDEC" stroke="#E74C3C"/>
  <text x="575" y="168" text-anchor="middle" font-size="9.5" fill="#C0392B">Hindi Text</text>

  <!-- Text Linear → Hindi Text -->
  <line x1="575" y1="138" x2="575" y2="148" stroke="#E74C3C" stroke-width="1" marker-end="url(#arr-red)"/>

  <!-- Hindi tokeniser, in the "New" colour -->
  <rect x="520" y="195" width="160" height="40" rx="5" fill="#27AE60"/>
  <text x="600" y="214" text-anchor="middle" font-size="11" font-weight="bold" fill="white">Hindi SentencePiece ★</text>
  <text x="600" y="228" text-anchor="middle" font-size="9" fill="#D5F5E3">Hindi vocabulary</text>

  <!-- Dashed curve: tokeniser → right edge of the Temporal Transformer's Text Embeddings row -->
  <path d="M 520 215 Q 500 180 480 154" fill="none" stroke="#27AE60" stroke-width="1.2" stroke-dasharray="4,3" marker-end="url(#arr-green)"/>

  <!-- Audio token output of the Depth Transformer -->
  <rect x="520" y="310" width="110" height="38" rx="5" fill="#EBF5FB" stroke="#5DADE2"/>
  <text x="575" y="334" text-anchor="middle" font-size="10" font-weight="bold" fill="#333">Audio Tokens</text>

  <!-- Depth Transformer → Audio Tokens -->
  <line x1="492" y1="329" x2="518" y2="329" stroke="#555" stroke-width="1.2" marker-end="url(#arr)"/>

  <!-- Dashed feedback: Audio Tokens → Mimi Decoder.
       Routed out of the bottom of the Audio Tokens box, underneath the Depth
       Transformer frame (which ends at y=395), then up the gap between the
       decoder and the frame, so the line never crosses the transformer's rows.
       It enters the decoder's right edge at the height of the Moshi Audio arrow. -->
  <path d="M 575 348 V 404 H 256 V 268 H 222" fill="none" stroke="#5DADE2" stroke-width="1.2" stroke-dasharray="4,3" stroke-linejoin="round" marker-end="url(#arr-blue)"/>

  <!-- Footnotes explaining the ✦ and ★ symbols used in the labels above -->
  <text x="20" y="400" font-size="9" fill="#C0392B">✦ Reinitialised for Hindi</text>
  <text x="20" y="414" font-size="9" fill="#27AE60">★ New component</text>

</svg>