[ { "@graph" : [ { "@id" : "https://arxiv.org/abs/2311.13171", "https://sense-nets.xyz/hasZoteroItemType" : [ { "@value" : "preprint" } ] }, { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#assertion", "http://purl.org/dc/terms/creator" : [ { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16Vts" } ], "http://purl.org/spar/cito/discusses" : [ { "@id" : "https://arxiv.org/abs/2311.13171" }, { "@id" : "https://www.alphaxiv.org/pdf/2408.03092" }, { "@id" : "https://x.com/prateeky2806/status/1727589818618523783" } ], "http://purl.org/spar/cito/includesQuotationFrom" : [ { "@id" : "https://x.com/prateeky2806/status/1727589818618523783" } ], "http://www.w3.org/2000/01/rdf-schema#comment" : [ { "@value" : " Merging models trained for long with WIDEN\nWhen models were trained on a lot of data they diverged further from the baseline (e.g. in continual pretraining for additional languages), current merging methods underperform in this setting\n\nhttps://alphaxiv.org/pdf/2408.03092\n@AlibabaGroup https://twitter.com/LChoshen/status/1823002789217493392/photo/1\n\n How do you do that?\nLet's assume we update a matrix with a few models.\nPick a pretrained model and consider the rest of the models as diff from it (task vectors)\nNormalize the row of each model, separating the normalization factor (magnitude) and direction (row)\n\n Now we weigh every row by how much it changed (higher = better) and average all together \n+ some trick to sometimes keep the original weight so weights might not sum to 1.\nYou can see how this follows recent findings about direction and size (e.g. https://x.com/prateeky2806/status/1727589818618523783)\n\n While the results in \"just\" merging are not changing that much, merging with a continually trained model (Sailor) that added many languages look quite good! https://twitter.com/LChoshen/status/1823002796259791276/photo/1\n\n Criticism (@askalphaxiv didn't upload comment):\nThere is a vast overclaiming calling Sailor a different pretrained model.\nQuite complex, hard to know if it will generalize\nand they only show a specific model.\n\n" } ], "https://schema.org/keywords" : [ { "@value" : "Sailor" }, { "@value" : "WIDEN" }, { "@value" : "large-language-models" }, { "@value" : "model-merging" }, { "@value" : "weight-disentanglement" } ] }, { "@id" : "https://www.alphaxiv.org/pdf/2408.03092", "https://sense-nets.xyz/hasZoteroItemType" : [ { "@value" : "webpage" } ] }, { "@id" : "https://x.com/prateeky2806/status/1727589818618523783", "https://sense-nets.xyz/hasZoteroItemType" : [ { "@value" : "forumPost" } ] } ], "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#assertion" }, { "@graph" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0", "@type" : [ "http://www.nanopub.org/nschema#Nanopublication" ], "http://www.nanopub.org/nschema#hasAssertion" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#assertion" } ], "http://www.nanopub.org/nschema#hasProvenance" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#provenance" } ], "http://www.nanopub.org/nschema#hasPublicationInfo" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#pubinfo" } ] } ], "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#head" }, { "@graph" : [ { "@id" : "https://sense-nets.xyz/", "@type" : [ "http://www.w3.org/ns/prov#SoftwareAgent" ], "http://www.w3.org/ns/prov#actedOnBehalfOf" : [ { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16Vts" } ] }, { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#activity", "@type" : [ "https://sense-nets.xyz/supervisedActivity" ], "http://www.w3.org/ns/prov#wasAssociatedWith" : [ { "@id" : "https://sense-nets.xyz/" } ] }, { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#assertion", "http://www.w3.org/ns/prov#linksTo" : [ { "@id" : "https://x.com/LChoshen/status/1823002789217493392" } ], "http://www.w3.org/ns/prov#wasAssociatedWith" : [ { "@id" : "https://x.com/LChoshen" } ], "http://www.w3.org/ns/prov#wasAttributedTo" : [ { "@id" : "https://orcid.org/0000-0002-0085-6496" }, { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16Vts" } ], "http://www.w3.org/ns/prov#wasGeneratedBy" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#activity" } ] }, { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16Vts", "http://xmlns.com/foaf/0.1/account" : [ { "@id" : "https://orcid.org/0000-0002-0085-6496" }, { "@id" : "https://x.com/LChoshen" } ] } ], "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#provenance" }, { "@graph" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0", "http://purl.org/dc/terms/created" : [ { "@type" : "http://www.w3.org/2001/XMLSchema#dateTime", "@value" : "2024-09-03T21:16:16.131Z" } ], "http://purl.org/dc/terms/creator" : [ { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16Vts" } ], "http://purl.org/dc/terms/license" : [ { "@id" : "https://creativecommons.org/licenses/by/4.0/" } ], "http://purl.org/nanopub/x/hasNanopubType" : [ { "@id" : "https://sense-nets.xyz/SemanticPost" } ], "http://purl.org/nanopub/x/wasCreatedAt" : [ { "@id" : "https://sense-nets.xyz/" } ], "http://www.w3.org/2000/01/rdf-schema#label" : [ { "@value" : "CoSMO Semantic Post" } ], "http://www.w3.org/ns/prov#wasAttributedTo" : [ { "@id" : "https://orcid.org/0000-0002-0085-6496" } ], "https://sense-nets.xyz/hasRootSigner" : [ { "@value" : "0xf6ECcfD463afB464dcC85b051DF2E93E2646E6D2" } ] }, { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#sig", "http://purl.org/nanopub/x/hasAlgorithm" : [ { "@value" : "RSA" } ], "http://purl.org/nanopub/x/hasPublicKey" : [ { "@value" : "MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArHtI92jm8pAYVsvJabxLGfOT+7G0JyJGh2gwjB5x2pFPga6wWTd+rNBWWUZViIFnaJrBEsJpgdnoupLU9ppwn+khMiGRfxqGsDDzwHcj3Jc75CRys7d3etwXdBdoXfBgjsJiZBazwm13idr6tljRrC1TaEJBnRQAqzBw9cLDeGY77cSznzXT39feUGT168dpCSE9O6u/48DvvWVqciHGsH9cQ+LroJJVsMrorwtsdZnAK+q48wtIP6pIpw5shSJ5LnA0qeN/f4TvTFDV6ItYIXjiWWpTECc/Bxmfnyat3B5xWCu9nvz8fEs7Ns0TuzQwT3/K55iSKDEIi/E0nO97xwIDAQAB" } ], "http://purl.org/nanopub/x/hasSignature" : [ { "@value" : "BMCHmxj4685c4tB4MzssQlbmilVpyC5oQEPuiEqc4AHbLlU0uJStQhpua7d52ZKIDFMi9nmrvLJc7eFuYs6gyjJzve0WY5BNHdpurTkJeU3Tyh9G2vsmlVof2FQc6QaijFR5DFKECKems3CSMJuBxChDj+hqrjS6DloVTdEIEalSHXsOw0utP7P/ZZvdhvkTMYaPPhuJspFjyGYmfLVb/m+Gr2zlsQgXRxdS5qc8LvGdAAjRxS4LAwzk7rklJXEfyDEWZ+B9V5hPzsmmqb60iFPaA9PHyqFGUT+EP1WFyJdIVL5PS48izFWx0+KDaTH4Nm6JrQUSO8kNx348rgKYZA==" } ], "http://purl.org/nanopub/x/hasSignatureTarget" : [ { "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0" } ], "http://purl.org/nanopub/x/singedBy" : [ { "@id" : "https://sense-nets.xyz/" } ], "http://www.w3.org/ns/prov#wasAssociatedWith" : [ { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16VtssigningDelegation" } ] }, { "@id" : "https://w3id.org/np/RAoSadUw99CeqDlR2400018nqTzR_38fT86OrTzk16Vts", "http://xmlns.com/foaf/0.1/account" : [ { "@id" : "https://orcid.org/0000-0002-0085-6496" } ], "http://xmlns.com/foaf/0.1/name" : [ { "@value" : "Leshem Choshen 🤖🤗 @ICML wanna talk?" } ] } ], "@id" : "https://w3id.org/np/RAPwHYQQtXh6p3DQQ066TmpKOBMIWkerAYv-chCViAqC0#pubinfo" } ]