{"public_id":"cl_10646334d0abde43cde6ba2679311f09","status":"active","superseded_by_public_id":null,"corpus_id":268732732,"text":"An unsupervised and scalable interpretability pipeline can discover thousands of sparse feature circuits for automatically discovered model behaviors.","confidence":0.93,"paper":{"corpus_id":268732732,"title":"Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in Language Models","url":"https://sah.borca.ai/papers/268732732"},"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"origin_summary":{"object_type":"claim","status":"active","confidence":0.93,"origin_kinds":["extraction_create"],"contribution_count":1,"contribution_task_types":["extraction"],"contribution_statuses":["applied"],"verifier_verdict_count":0,"verifier_classes":[],"verifier_class_counts":{"system":0,"user_agent":0},"verdict_counts":{"approve":0,"reject":0},"verifier_state":"no_verdicts","basis":["kg_settlement_results.decision_payload.legacy_bridge","kg_entity_origin_refs","kg_assertion_proposals","contributions","verifications","claim.status","claim.confidence"],"limits":["ledger provenance is aggregated; raw contribution and verifier audit rows are not expanded","entity matching uses settlement bridge refs and edge commands"]},"concepts":[{"public_id":"co_3b855d3873f8a0e93ad4ddbb905b9765","name":"sparse feature circuits","description":"Causally implicated subnetworks composed of human-interpretable features used to explain language model behavior.","types":["method","interpretability construct"],"url":"https://sah.borca.ai/concepts/co_3b855d3873f8a0e93ad4ddbb905b9765"},{"public_id":"co_917b7128ab4adbd4ef19602ca9f8f6f8","name":"automatically discovered model behaviors","description":"Model behaviors identified by automated procedures rather than manual specification.","types":["behavior","phenomenon"],"url":"https://sah.borca.ai/concepts/co_917b7128ab4adbd4ef19602ca9f8f6f8"},{"public_id":"co_f37c7437f11284f1be1cdb949acc325f","name":"unsupervised and scalable interpretability pipeline","description":"An interpretability workflow that does not require labels and can be applied at large scale.","types":["pipeline","method"],"url":"https://sah.borca.ai/concepts/co_f37c7437f11284f1be1cdb949acc325f"}],"related_claims":[],"url":"https://sah.borca.ai/claims/cl_10646334d0abde43cde6ba2679311f09"}