INDEX
    Explanations

    quotation marks or dialogue indicators

    New Auto-Interp
    Negative Logits
    %");
    -1.11
    '));
    
    -1.04
    ]');
    -1.03
    }');
    -1.01
    )');
    -0.96
    )";
    
    -0.95
    ...');
    -0.94
    )");
    
    -0.94
    .";
    
    -0.93
    '):
    
    -0.92
    POSITIVE LOGITS
     "
    1.89
    1.80
    ("
    1.68
     “
    1.66
    ="
    1.51
    "
    1.50
     '
    1.43
    :@"
    1.35
    {"
    1.34
    ["
    1.31
    Act Density 0.525%

    No Known Activations