INDEX
    Explanations

    quotes or dialogue in the text

    New Auto-Interp
    Negative Logits
    ]');
    -1.06
    '));
    
    -0.98
    }');
    -0.96
    ...');
    -0.91
    /');
    -0.89
    %");
    -0.89
    \\
    
    -0.88
    )');
    -0.88
    '){
    
    -0.86
    '],
    
    -0.86
    POSITIVE LOGITS
    1.61
     "
    1.52
     “
    1.34
    ("
    1.34
    "
    1.31
     '
    1.14
    ="
    1.14
    ,“
    1.10
    =="
    1.10
    1.09
    Act Density 0.312%

    No Known Activations