INDEX
    Explanations

    opening and closing parentheses in the text

    New Auto-Interp
    Negative Logits
    )))));
    -0.77
    }})
    -0.68
    )})
    -0.65
    )});
    -0.64
    }});
    -0.63
    ')))
    -0.62
    ))));
    -0.62
    "})
    -0.61
    _));
    -0.61
    "]));
    -0.60
    POSITIVE LOGITS
    (
    1.16
    ₁(
    0.73
    (\
    0.73
    ²(
    0.71
    __(
    0.71
     (
    0.70
    >(
    0.70
      (
    0.70
    (
    
    0.69
    !(
    0.69
    Act Density 1.160%

    No Known Activations