{"corpus_id":268732732,"paper_sha":"b8f280d8bf685f8da7c83068e73f000528072d6b","doi":"10.48550/arXiv.2403.19647","arxiv_id":"2403.19647","pmid":null,"pmcid":null,"mag_id":null,"dblp_id":"journals/corr/abs-2403-19647","acl_id":null,"title":"Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in Language Models","year":2024,"publication_date":"2024-03-28","venue":"International Conference on Learning Representations","journal":{"name":"ArXiv","pages":null,"volume":"abs/2403.19647"},"journal_issn":null,"journal_title":null,"publication_types":["JournalArticle"],"pubmed_pub_types":null,"s2_fields_of_study":["Linguistics","Computer Science"],"reference_count":86,"citation_count":277,"influential_citation_count":36,"is_open_access":false,"arxiv_categories":["cs.LG","cs.AI","cs.CL"],"arxiv_license":"http://creativecommons.org/licenses/by/4.0/","arxiv_journal_ref":"International Conference on Learning Representations, 2025","mesh_headings":null,"chemicals":null,"comments_corrections":null,"source_flags":1,"s2_open_access_pdf_url":null,"s2_open_access_landing_url":null,"s2_open_access_license":null,"s2_open_access_status":null,"pmc_open_access_pdf_url":null,"pmc_open_access_landing_url":null,"pmc_open_access_license":null,"pmc_open_access_status":null,"unpaywall_open_access_pdf_url":null,"unpaywall_open_access_landing_url":null,"unpaywall_open_access_license":null,"unpaywall_open_access_status":null,"abstract":"We introduce methods for discovering and applying sparse feature circuits. These are causally implicated subnetworks of human-interpretable features for explaining language model behaviors. Circuits identified in prior work consist of polysemantic and difficult-to-interpret units like attention heads or neurons, rendering them unsuitable for many downstream applications. In contrast, sparse feature circuits enable detailed understanding of unanticipated mechanisms. Because they are based on fine-grained units, sparse feature circuits are useful for downstream tasks: We introduce SHIFT, where we improve the generalization of a classifier by ablating features that a human judges to be task-irrelevant. Finally, we demonstrate an entirely unsupervised and scalable interpretability pipeline by discovering thousands of sparse feature circuits for automatically discovered model behaviors.","claims":[{"public_id":"cl_0037a70add3d5655f7415dd3e9245754","status":"active","text":"A classifier's generalization can be improved by ablating features that a human judges to be task-irrelevant.","confidence":0.94,"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/claims/cl_0037a70add3d5655f7415dd3e9245754"},{"public_id":"cl_10646334d0abde43cde6ba2679311f09","status":"active","text":"An unsupervised and scalable interpretability pipeline can discover thousands of sparse feature circuits for automatically discovered model behaviors.","confidence":0.93,"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/claims/cl_10646334d0abde43cde6ba2679311f09"},{"public_id":"cl_96e5e261c3f4d5bba23c03b2055ba46b","status":"active","text":"Sparse feature circuits provide a causally grounded, human-interpretable alternative to circuits built from polysemantic attention heads or neurons.","confidence":0.95,"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/claims/cl_96e5e261c3f4d5bba23c03b2055ba46b"}],"concepts":[{"public_id":"co_1a5ea839ef7fa52b50de068b012cfa8a","status":"active","name":"polysemantic and difficult-to-interpret units","description":"Model components such as attention heads or neurons that encode multiple functions and are hard to interpret.","types":["model component"],"aliases":["polysemantic units","attention heads","neurons"],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_1a5ea839ef7fa52b50de068b012cfa8a"},{"public_id":"co_1d052b5988b8ef6b2438ebdcb0029ec8","status":"active","name":"language model behaviors","description":"The outputs, predictions, or internal behaviors exhibited by a language model.","types":["phenomenon"],"aliases":["LM behaviors"],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_1d052b5988b8ef6b2438ebdcb0029ec8"},{"public_id":"co_29a867b3c1de21d507fae5c27d5daee7","status":"active","name":"feature ablation","description":"The removal or deactivation of selected features in a model to test or alter their influence.","types":["intervention","method"],"aliases":["ablating features"],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_29a867b3c1de21d507fae5c27d5daee7"},{"public_id":"co_2e04362698dd27b2ef7481a8890eaef3","status":"active","name":"downstream applications","description":"Uses of an interpretability method beyond the initial analysis task.","types":["application context"],"aliases":[],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_2e04362698dd27b2ef7481a8890eaef3"},{"public_id":"co_3b855d3873f8a0e93ad4ddbb905b9765","status":"active","name":"sparse feature circuits","description":"Causally implicated subnetworks composed of human-interpretable features used to explain language model behavior.","types":["method","interpretability construct"],"aliases":[],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_3b855d3873f8a0e93ad4ddbb905b9765"},{"public_id":"co_917b7128ab4adbd4ef19602ca9f8f6f8","status":"active","name":"automatically discovered model behaviors","description":"Model behaviors identified by automated procedures rather than manual specification.","types":["behavior","phenomenon"],"aliases":[],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_917b7128ab4adbd4ef19602ca9f8f6f8"},{"public_id":"co_d3d4eba0dc0c243d80d92cbbbd4067eb","status":"active","name":"SHIFT","description":"A feature-ablation approach that uses human judgment about task relevance to improve classifier generalization.","types":["method"],"aliases":[],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_d3d4eba0dc0c243d80d92cbbbd4067eb"},{"public_id":"co_f2bfa51ab4a5ca42f13e725a064a5e2a","status":"active","name":"task-irrelevant features","description":"Features judged by a human to be unrelated to the target task.","types":["feature"],"aliases":[],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_f2bfa51ab4a5ca42f13e725a064a5e2a"},{"public_id":"co_f37c7437f11284f1be1cdb949acc325f","status":"active","name":"unsupervised and scalable interpretability pipeline","description":"An interpretability workflow that does not require labels and can be applied at large scale.","types":["pipeline","method"],"aliases":[],"contributors":[{"id":1,"public_id":"12632b8b5f","public_label":"Anonymous (12632b8b5f)","roles":["extraction"],"url":"https://sah.borca.ai/u/12632b8b5f"}],"url":"https://sah.borca.ai/concepts/co_f37c7437f11284f1be1cdb949acc325f"}],"external_ids":{"DOI":"10.48550/arXiv.2403.19647","ArXiv":"2403.19647","PubMed":null,"PubMedCentral":null,"MAG":null,"DBLP":"journals/corr/abs-2403-19647","ACL":null},"open_access":{"is_open_access":true,"pdf_url":"https://arxiv.org/pdf/2403.19647","landing_url":"https://arxiv.org/abs/2403.19647","source":"arxiv","pdf_url_source":"derived_arxiv","license":"http://creativecommons.org/licenses/by/4.0/","reason":null},"reference_availability":{"status":"available","references_indexed":true,"full_text_available":true,"full_text_source":"arxiv","count_basis":"semantic_scholar_metadata","extraction_status":"not_applicable","reason":null},"source":{"provider":"episteme2","base_corpus":"semantic_scholar_dump","freshness_mode":"unknown","basis":["semantic_scholar_metadata","postgres_metadata"],"limits":["paper metadata is based on indexed upstream scholarly datasets","claims and concepts are available only for extracted papers","absence of claims or concepts means no extracted graph data is available in this response"],"status":"available","degraded":false,"degraded_reasons":[],"diagnostics":{"status":"available","degraded":false,"degraded_reasons":[],"metadata_status":"available","graph_status":"available","abstract_status":"available"},"source_flags":1},"paper_id":631257,"paper_uid":"4190d09b-5b65-4a20-a62e-6f0036c35082","canonical_identity":{"paper_id":631257,"paper_uid":"4190d09b-5b65-4a20-a62e-6f0036c35082","identity_status":"available","lookup_basis":"semantic_scholar_external_id","compatibility_path":"corpus_id"},"url":"https://sah.borca.ai/papers/268732732"}