INDEX
    Explanations

    percentage placeholders in formatted strings

    New Auto-Interp
    Negative Logits
    )";
    
    -0.92
    )");
    
    -0.82
    ]";
    -0.82
    )');
    -0.78
    "])
    
    -0.77
    ."]
    -0.76
    ukone
    -0.74
    }';
    -0.74
    ]");
    -0.73
    )';
    -0.72
    POSITIVE LOGITS
    ("%
    1.25
    ('%
    1.17
    ,"%
    1.13
    ='%
    0.99
     "%
    0.98
    ="%
    0.98
    :%
    0.97
     '%
    0.93
     %
    0.90
    
    0.82
    Act Density 0.034%

    No Known Activations