diff --git a/README.md b/README.md
index e6c3ac0..a94be97 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,19 @@
 # AttentionXML
 [AttentionXML: Label Tree-based Attention-Aware Deep Model for High-Performance Extreme Multi-Label Text Classification](https://arxiv.org/abs/1811.01727)
 
-## Requirements
-
-* python==3.7.4
-* click==7.0
-* ruamel.yaml==0.16.5
-* numpy==1.16.2
-* scipy==1.3.1
-* scikit-learn==0.21.2
-* gensim==3.4.0
-* torch==1.0.1
-* nltk==3.4
-* tqdm==4.31.1
-* joblib==0.13.2
-* logzero==1.5.0
+## Installation
+
+You can install AttentionXML with the following commands:
+
+```bash
+git clone
+cd AttentionXML
+conda create -n attentionxml python=3.13
+conda activate attentionxml
+# As of 2025/12/16, the newest version of torch is 2.9.1; you can install it directly with:
+pip3 install torch --index-url https://download.pytorch.org/whl/cu126
+pip install -r requirements.txt
+```
 
 ## Datasets
 
diff --git a/deepxml/cluster.py b/deepxml/cluster.py
index ab4bdd6..4c67473 100644
--- a/deepxml/cluster.py
+++ b/deepxml/cluster.py
@@ -29,7 +29,7 @@ def build_tree_by_level(sparse_data_x, sparse_data_y, mlb, eps: float, max_leaf:
     levels, q = [2**x for x in levels], None
     for i in range(len(levels)-1, -1, -1):
         if os.path.exists(F'{groups_path}-Level-{i}.npy'):
-            labels_list = np.load(F'{groups_path}-Level-{i}.npy')
+            labels_list = np.load(F'{groups_path}-Level-{i}.npy', allow_pickle=True)
             q = [(labels_i, labels_f[labels_i]) for labels_i in labels_list]
             break
     if q is None:
diff --git a/deepxml/data_utils.py b/deepxml/data_utils.py
index 610dbd8..d6066a9 100644
--- a/deepxml/data_utils.py
+++ b/deepxml/data_utils.py
@@ -43,13 +43,15 @@ def get_word_emb(vec_path, vocab_path=None):
     if vocab_path is not None:
         with open(vocab_path) as fp:
             vocab = {word: idx for idx, word in enumerate(fp)}
-        return np.load(vec_path), vocab
+        return np.load(vec_path, allow_pickle=True), vocab
     else:
-        return np.load(vec_path)
+        return np.load(vec_path, allow_pickle=True)
 
 
 def get_data(text_file, label_file=None):
-    return np.load(text_file), np.load(label_file) if label_file is not None else None
+    text_data = np.load(text_file, allow_pickle=True)
+    label_data = np.load(label_file, allow_pickle=True) if label_file is not None else None
+    return text_data, label_data
 
 
 def convert_to_binary(text_file, label_file=None, max_len=None, vocab=None, pad='<PAD>', unknown='<UNK>'):
@@ -74,6 +76,11 @@ def truncate_text(texts, max_len=500, padding_idx=0, unknown_idx=1):
 
 def get_mlb(mlb_path, labels=None) -> MultiLabelBinarizer:
     if os.path.exists(mlb_path):
+        # Handle sklearn module path changes for backward compatibility
+        import sys
+        import sklearn.preprocessing
+        if 'sklearn.preprocessing.label' not in sys.modules:
+            sys.modules['sklearn.preprocessing.label'] = sklearn.preprocessing
         return joblib.load(mlb_path)
     mlb = MultiLabelBinarizer(sparse_output=True)
     mlb.fit(labels)
@@ -83,7 +90,7 @@
 
 def get_sparse_feature(feature_file, label_file):
     sparse_x, _ = load_svmlight_file(feature_file, multilabel=True)
-    return normalize(sparse_x), np.load(label_file) if label_file is not None else None
+    return normalize(sparse_x), np.load(label_file, allow_pickle=True) if label_file is not None else None
 
 
 def output_res(output_path, name, scores, labels):
diff --git a/deepxml/evaluation.py b/deepxml/evaluation.py
index 16e2a9a..170cf59 100644
--- a/deepxml/evaluation.py
+++ b/deepxml/evaluation.py
@@ -27,11 +27,12 @@
 
 def get_mlb(classes: TClass = None, mlb: TMlb = None, targets: TTarget = None):
     if classes is not None:
-        mlb = MultiLabelBinarizer(classes, sparse_output=True)
+        mlb = MultiLabelBinarizer(sparse_output=True)
+        mlb.fit([classes])
     if mlb is None and targets is not None:
         if isinstance(targets, csr_matrix):
-            mlb = MultiLabelBinarizer(range(targets.shape[1]), sparse_output=True)
-            mlb.fit(None)
+            mlb = MultiLabelBinarizer(sparse_output=True)
+            mlb.fit([list(range(targets.shape[1]))])
         else:
             mlb = MultiLabelBinarizer(sparse_output=True)
             mlb.fit(targets)
diff --git a/deepxml/models.py b/deepxml/models.py
index 55e5b2a..34be32c 100644
--- a/deepxml/models.py
+++ b/deepxml/models.py
@@ -29,7 +29,10 @@ class Model(object):
 
     """
     def __init__(self, network, model_path, gradient_clip_value=5.0, device_ids=None, **kwargs):
-        self.model = nn.DataParallel(network(**kwargs).cuda(), device_ids=device_ids)
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        if device_ids is None and torch.cuda.is_available():
+            device_ids = list(range(torch.cuda.device_count()))
+        self.model = nn.DataParallel(network(**kwargs).to(self.device), device_ids=device_ids)
         self.loss_fn = nn.BCEWithLogitsLoss()
         self.model_path, self.state = model_path, {}
         os.makedirs(os.path.split(self.model_path)[0], exist_ok=True)
@@ -64,7 +67,7 @@ def train(self, train_loader: DataLoader, valid_loader: DataLoader, opt_params:
                 self.swa_init()
             for i, (train_x, train_y) in enumerate(train_loader, 1):
                 global_step += 1
-                loss = self.train_step(train_x, train_y.cuda())
+                loss = self.train_step(train_x, train_y.to(self.device))
                 if global_step % step == 0:
                     self.swa_step()
                     self.swap_swa_params()
@@ -99,9 +102,9 @@ def clip_gradient(self):
         if self.gradient_clip_value is not None:
             max_norm = max(self.gradient_norm_queue)
             total_norm = torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm * self.gradient_clip_value)
-            self.gradient_norm_queue.append(min(total_norm, max_norm * 2.0, 1.0))
+            self.gradient_norm_queue.append(min(total_norm.item(), max_norm * 2.0, 1.0))
             if total_norm > max_norm * self.gradient_clip_value:
-                logger.warn(F'Clipping gradients with total norm {round(total_norm, 5)} '
+                logger.warn(F'Clipping gradients with total norm {round(total_norm.item(), 5)} '
                             F'and max norm {round(max_norm, 5)}')
 
     def swa_init(self):
@@ -118,7 +121,7 @@ def swa_step(self):
             beta = 1.0 / swa_state['models_num']
             with torch.no_grad():
                 for n, p in self.model.named_parameters():
-                    swa_state[n].mul_(1.0 - beta).add_(beta, p.data)
+                    swa_state[n].mul_(1.0 - beta).add_(p.data, alpha=beta)
 
     def swap_swa_params(self):
         if 'swa' in self.state:
@@ -162,7 +165,7 @@ def predict_step(self, data_x: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
         self.model.eval()
         with torch.no_grad():
             scores = torch.sigmoid(self.network(data_x, candidates=candidates, attn_weights=self.attn_weights))
-            scores, labels = torch.topk(scores * group_scores.cuda(), k)
+            scores, labels = torch.topk(scores * group_scores.to(self.device), k)
             return scores.cpu(), candidates[np.arange(len(data_x)).reshape(-1, 1), labels.cpu()]
 
     def train(self, *args, **kwargs):
diff --git a/deepxml/modules.py b/deepxml/modules.py
index d17a99f..8b4848d 100644
--- a/deepxml/modules.py
+++ b/deepxml/modules.py
@@ -27,8 +27,9 @@ def __init__(self, vocab_size=None, emb_size=None, emb_init=None, emb_trainable=
             if emb_size is not None:
                 assert emb_size == emb_init.shape[1]
             vocab_size, emb_size = emb_init.shape
-        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx, sparse=True,
-                                _weight=torch.from_numpy(emb_init).float() if emb_init is not None else None)
+        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx, sparse=True)
+        if emb_init is not None:
+            self.emb.weight.data.copy_(torch.from_numpy(emb_init).float())
         self.emb.weight.requires_grad = emb_trainable
         self.dropout = nn.Dropout(dropout)
         self.padding_idx = padding_idx
@@ -54,7 +55,9 @@ def forward(self, inputs, lengths, **kwargs):
         init_state = self.init_state.repeat([1, inputs.size(0), 1])
         cell_init, hidden_init = init_state[:init_state.size(0)//2], init_state[init_state.size(0)//2:]
         idx = torch.argsort(lengths, descending=True)
-        packed_inputs = nn.utils.rnn.pack_padded_sequence(inputs[idx], lengths[idx], batch_first=True)
+        # In PyTorch 2.x, pack_padded_sequence requires lengths to be on CPU
+        lengths_cpu = lengths[idx].cpu()
+        packed_inputs = nn.utils.rnn.pack_padded_sequence(inputs[idx], lengths_cpu, batch_first=True)
         outputs, _ = nn.utils.rnn.pad_packed_sequence(
             self.lstm(packed_inputs, (hidden_init, cell_init))[0], batch_first=True)
         return self.dropout(outputs[torch.argsort(idx)])
@@ -71,7 +74,7 @@ def __init__(self, labels_num, hidden_size):
 
     def forward(self, inputs, masks):
         masks = torch.unsqueeze(masks, 1)  # N, 1, L
-        attention = self.attention(inputs).transpose(1, 2).masked_fill(1.0 - masks, -np.inf)  # N, labels_num, L
+        attention = self.attention(inputs).transpose(1, 2).masked_fill(~masks, -np.inf)  # N, labels_num, L
         attention = F.softmax(attention, -1)
         return attention @ inputs  # N, labels_num, hidden_size
 
@@ -82,14 +85,20 @@ class AttentionWeights(nn.Module):
     """
     def __init__(self, labels_num, hidden_size, device_ids=None):
         super(AttentionWeights, self).__init__()
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         if device_ids is None:
-            device_ids = list(range(1, torch.cuda.device_count()))
+            device_ids = list(range(1, torch.cuda.device_count())) if torch.cuda.is_available() else [0]
         assert labels_num >= len(device_ids)
         group_size, plus_num = labels_num // len(device_ids), labels_num % len(device_ids)
         self.group = [group_size + 1] * plus_num + [group_size] * (len(device_ids) - plus_num)
         assert sum(self.group) == labels_num
-        self.emb = nn.ModuleList(nn.Embedding(size, hidden_size, sparse=True).cuda(device_ids[i])
-                                 for i, size in enumerate(self.group))
+        # Create embeddings on appropriate devices
+        if torch.cuda.is_available() and len(device_ids) > 0:
+            self.emb = nn.ModuleList(nn.Embedding(size, hidden_size, sparse=True).to(f'cuda:{device_ids[i]}')
+                                     for i, size in enumerate(self.group))
+        else:
+            self.emb = nn.ModuleList(nn.Embedding(size, hidden_size, sparse=True).to(self.device)
+                                     for i, size in enumerate(self.group))
         std = (6.0 / (labels_num + hidden_size)) ** 0.5
         with torch.no_grad():
             for emb in self.emb:
@@ -119,7 +128,7 @@ def forward(self, inputs, masks, candidates, attn_weights: nn.Module):
         masks = torch.unsqueeze(masks, 1)  # N, 1, L
         attn_inputs = inputs.transpose(1, 2)  # N, hidden, L
         attn_weights = self.attention(candidates) if hasattr(self, 'attention') else attn_weights(candidates)
-        attention = (attn_weights @ attn_inputs).masked_fill(1.0 - masks, -np.inf)  # N, sampled_size, L
+        attention = (attn_weights @ attn_inputs).masked_fill(~masks, -np.inf)  # N, sampled_size, L
         attention = F.softmax(attention, -1)  # N, sampled_size, L
         return attention @ inputs  # N, sampled_size, hidden_size
 
diff --git a/deepxml/optimizers.py b/deepxml/optimizers.py
index 640c8e8..64fce62 100644
--- a/deepxml/optimizers.py
+++ b/deepxml/optimizers.py
@@ -102,19 +102,19 @@ def make_sparse(values):
                     p.data.add_(make_sparse(-step_size * numer.div_(denom)))
                     if weight_decay > 0.0:
-                        p.data.add_(-group['lr'] * weight_decay, p.data.sparse_mask(grad))
+                        p.data.add_(p.data.sparse_mask(grad), alpha=-group['lr'] * weight_decay)
                 else:
                     # Decay the first and second moment running average coefficient
-                    exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                    exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                    exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                    exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                     denom = exp_avg_sq.sqrt().add_(group['eps'])
 
                     bias_correction1 = 1 - beta1 ** state['step']
                     bias_correction2 = 1 - beta2 ** state['step']
                     step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
 
-                    p.data.addcdiv_(-step_size, exp_avg, denom)
+                    p.data.addcdiv_(exp_avg, denom, value=-step_size)
 
                     if weight_decay > 0.0:
-                        p.data.add_(-group['lr'] * weight_decay, p.data)
+                        p.data.add_(p.data, alpha=-group['lr'] * weight_decay)
 
         return loss
diff --git a/deepxml/tree.py b/deepxml/tree.py
index 9d8d845..610a9cd 100644
--- a/deepxml/tree.py
+++ b/deepxml/tree.py
@@ -62,7 +62,7 @@ def train_level(self, level, train_x, train_y, valid_x, valid_y):
         if level == 0:
             while not os.path.exists(F'{self.groups_path}-Level-{level}.npy'):
                 time.sleep(30)
-            groups = np.load(F'{self.groups_path}-Level-{level}.npy')
+            groups = np.load(F'{self.groups_path}-Level-{level}.npy', allow_pickle=True)
             train_y, valid_y = self.get_mapping_y(groups, self.labels_num, train_y, valid_y)
             labels_num = len(groups)
             train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
@@ -83,7 +83,8 @@ def train_level(self, level, train_x, train_y, valid_x, valid_y):
             return train_y, model.predict(train_loader, k=self.top), model.predict(valid_loader, k=self.top)
         else:
             train_group_y, train_group, valid_group = self.train_level(level - 1, train_x, train_y, valid_x, valid_y)
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
             logger.info('Getting Candidates')
             _, group_labels = train_group
@@ -112,12 +113,12 @@ def train_level(self, level, train_x, train_y, valid_x, valid_y):
             if level < self.level - 1:
                 while not os.path.exists(F'{self.groups_path}-Level-{level}.npy'):
                     time.sleep(30)
-                groups = np.load(F'{self.groups_path}-Level-{level}.npy')
+                groups = np.load(F'{self.groups_path}-Level-{level}.npy', allow_pickle=True)
                 train_y, valid_y = self.get_mapping_y(groups, self.labels_num, train_y, valid_y)
                 labels_num, last_groups = len(groups), self.get_inter_groups(len(groups))
             else:
                 groups, labels_num = None, train_y.shape[1]
-                last_groups = np.load(F'{self.groups_path}-Level-{level-1}.npy')
+                last_groups = np.load(F'{self.groups_path}-Level-{level-1}.npy', allow_pickle=True)
 
             train_loader = DataLoader(XMLDataset(train_x, train_y, labels_num=labels_num,
                                                  groups=last_groups, group_labels=group_candidates),
@@ -169,11 +170,12 @@ def predict_level(self, level, test_x, k, labels_num):
             return model.predict(test_loader, k=k)
         else:
             if level == self.level - 1:
-                groups = np.load(F'{self.groups_path}-Level-{level-1}.npy')
+                groups = np.load(F'{self.groups_path}-Level-{level-1}.npy', allow_pickle=True)
             else:
                 groups = self.get_inter_groups(labels_num)
             group_scores, group_labels = self.predict_level(level - 1, test_x, self.top, len(groups))
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             logger.info(F'Predicting Level-{level}, Top: {k}')
             if model is None:
                 model = XMLModel(network=FastAttentionRNN, labels_num=labels_num,
diff --git a/ensemble.py b/ensemble.py
index cb4fcb2..af39d4b 100644
--- a/ensemble.py
+++ b/ensemble.py
@@ -18,8 +18,8 @@
 def main(prefix, trees):
     labels, scores = [], []
     for i in range(trees):
-        labels.append(np.load(F'{prefix}-Tree-{i}-labels.npy'))
-        scores.append(np.load(F'{prefix}-Tree-{i}-scores.npy'))
+        labels.append(np.load(F'{prefix}-Tree-{i}-labels.npy', allow_pickle=True))
+        scores.append(np.load(F'{prefix}-Tree-{i}-scores.npy', allow_pickle=True))
     ensemble_labels, ensemble_scores = [], []
     for i in tqdm(range(len(labels[0]))):
         s = defaultdict(float)
diff --git a/evaluation.py b/evaluation.py
index 35525cb..e527ca7 100644
--- a/evaluation.py
+++ b/evaluation.py
@@ -23,13 +23,13 @@
 @click.option('-a', type=click.FLOAT, default=0.55, help='Parameter A for propensity score.')
 @click.option('-b', type=click.FLOAT, default=1.5, help='Parameter B for propensity score.')
 def main(results, targets, train_labels, a, b):
-    res, targets = np.load(results), np.load(targets)
+    res, targets = np.load(results, allow_pickle=True), np.load(targets, allow_pickle=True)
     mlb = MultiLabelBinarizer(sparse_output=True)
     targets = mlb.fit_transform(targets)
     print('Precision@1,3,5:', get_p_1(res, targets, mlb), get_p_3(res, targets, mlb), get_p_5(res, targets, mlb))
     print('nDCG@1,3,5:', get_n_1(res, targets, mlb), get_n_3(res, targets, mlb), get_n_5(res, targets, mlb))
     if train_labels is not None:
-        train_labels = np.load(train_labels)
+        train_labels = np.load(train_labels, allow_pickle=True)
         inv_w = get_inv_propensity(mlb.transform(train_labels), a, b)
         print('PSPrecision@1,3,5:', get_psp_1(res, targets, inv_w, mlb), get_psp_3(res, targets, inv_w, mlb),
               get_psp_5(res, targets, inv_w, mlb))
diff --git a/preprocess.py b/preprocess.py
index ec1318b..90d7600 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -47,7 +47,7 @@ def main(text_path, tokenized_path, label_path, vocab_path, emb_path, w2v_model,
             vocab, emb_init = build_vocab(fp, w2v_model, vocab_size=vocab_size)
         np.save(vocab_path, vocab)
         np.save(emb_path, emb_init)
-    vocab = {word: i for i, word in enumerate(np.load(vocab_path))}
+    vocab = {word: i for i, word in enumerate(np.load(vocab_path, allow_pickle=True))}
     logger.info(F'Vocab Size: {len(vocab)}')
 
     logger.info(F'Getting Dataset: {text_path} Max Length: 33,746')
diff --git a/requirements.txt b/requirements.txt
index a043057..7344ec5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
-click==7.0
-ruamel.yaml==0.16.5
-numpy==1.16.2
-scipy==1.3.1
-scikit-learn==0.21.2
-gensim==3.4.0
-torch==1.0.1
-nltk==3.4
-tqdm==4.31.1
-joblib==0.13.2
-logzero==1.5.0
+click>=8.0.0
+ruamel.yaml>=0.17.0
+numpy>=1.24.0
+scipy>=1.14.0
+scikit-learn>=1.3.0
+gensim>=4.3.0
+torch>=2.5.1
+nltk>=3.8.0
+tqdm>=4.65.0
+joblib>=1.3.0
+logzero>=1.7.0
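
For reference, a minimal sketch (not part of the patch) of the two API migrations this diff applies repeatedly: `np.load` needs `allow_pickle=True` for the object arrays this project saves, and the in-place tensor ops now take keyword `alpha`/`value` arguments instead of a leading scalar. The file name `demo.npy` is only illustrative.

```python
import numpy as np
import torch

# NumPy >= 1.16.3 defaults to allow_pickle=False, so loading object arrays
# (e.g. ragged label lists) raises a ValueError without the flag.
ragged = np.array([[1, 2], [3, 4, 5]], dtype=object)
np.save('demo.npy', ragged)
loaded = np.load('demo.npy', allow_pickle=True)

# PyTorch 2.x no longer accepts the old scalar-first call order; the keyword
# forms used throughout this patch are equivalent to the originals:
x, y = torch.ones(3), torch.full((3,), 2.0)
x.add_(y, alpha=0.5)                       # x += 0.5 * y
x.addcmul_(y, y, value=0.25)               # x += 0.25 * y * y
x.addcdiv_(y, torch.ones(3), value=-0.1)   # x += -0.1 * y / 1
```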
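Similarly, a small sketch (again outside the patch, with made-up shapes) of why `LSTMEncoder.forward` now moves the sequence lengths to the CPU before packing: recent PyTorch rejects CUDA `lengths` tensors in `pack_padded_sequence`.

```python
import torch
from torch import nn

# Batch of 2 padded sequences; lengths are sorted in descending order, which
# the encoder already guarantees via torch.argsort(lengths, descending=True).
inputs = torch.randn(2, 4, 8)   # N, L, hidden
lengths = torch.tensor([4, 2])

# Passing a CUDA tensor as `lengths` raises a RuntimeError on recent PyTorch
# builds, so the patch calls .cpu() on it first.
packed = nn.utils.rnn.pack_padded_sequence(inputs, lengths.cpu(), batch_first=True)
padded, out_lengths = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)
```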