INDEX
    Explanations
    New Auto-Interp
    Negative Logits
    )");
    
    -1.44
    )";
    
    -1.27
     themſelves
    -1.25
    `;
    
    -1.24
     myſelf
    -1.24
    `,
    
    -1.23
    ]]
    
    -1.20
    makeConstraints
    -1.20
    "]);
    
    -1.18
    "]];
    -1.17
    POSITIVE LOGITS
     color
    1.45
    color
    1.44
     Color
    1.40
    Color
    1.37
     COLOR
    1.36
    COLOR
    1.36
     colors
    1.32
     Colors
    1.18
    colors
    1.13
     colores
    1.11
    Act Density 0.039%

    No Known Activations