evaluate.py
'''
This file handles abstract generation.
It either prints a few sample results to the terminal, or generates many and writes them
to one or more JSON files in RESULTS_PATH.
'''
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
import os
import random
import re
import json
# Loading the model
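# load_saved_model toggles between a locally fine-tuned checkpoint (the "tokenizer" and
# "model" directories) and a fresh copy of the base GPT-Neo model from the Hugging Face Hub.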
load_saved_model = True
if load_saved_model:
    tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")
    model = GPTNeoForCausalLM.from_pretrained("model").cuda()
    print("Loaded saved model")
else:
    model_name = "EleutherAI/gpt-neo-125M"
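    # The special delimiter tokens must match the format used during fine-tuning;
    # the embedding matrix is resized below to make room for them.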
    tokenizer = GPT2Tokenizer.from_pretrained(
        model_name, bos_token='<SENTENCE>', eos_token='</SENTENCE>', pad_token='<PAD/>'
    )
    model = GPTNeoForCausalLM.from_pretrained(model_name).cuda()
    model.resize_token_embeddings(len(tokenizer))
    print("Loaded new model")
model.eval()
# Evaluate
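# With SAVE_TO_FILE=True, generate NUM abstracts with randomly sampled category sets and
# write them to JSON files under RESULTS_PATH; otherwise print a few seeded samples.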
SAVE_TO_FILE = False
RESULTS_PATH = "results/"
if SAVE_TO_FILE:
    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)
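    # arXiv math subject classes; each prompt samples 1..MAX_CATEGORIES of them at random.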
    MAX_CATEGORIES = 3
    CATEGORIES = ["math.AC", "math.AG", "math.AP", "math.AT", "math.CA", "math.CO", "math.CT", "math.CV", "math.DG", "math.DS", "math.FA", "math.GM", "math.GN", "math.GR", "math.HO", "math.IT", "math.KT", "math.LO", "math.MG", "math.MP", "math.NT", "math.OA", "math.OC", "math.PR", "math.QA", "math.RA", "math.RT", "math.SG", "math.SP", "math.ST"]
    NUM = 1000
    NUM_PER_FILE = 1000
    FILES = NUM // NUM_PER_FILE
    for file_num in range(FILES):
        result = []
        for i in range(NUM_PER_FILE):
            num_cats = random.randint(1, MAX_CATEGORIES)
            categories = " ".join(random.sample(CATEGORIES, num_cats))
            prompt = f'<SENTENCE>Categories: {categories}\nTitle:'
            generated = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
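            # Nucleus (top-p) sampling at a mild temperature; repetition_penalty and
            # no_repeat_ngram_size discourage degenerate repeated phrases.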
            sample_outputs = model.generate(
                generated,
                do_sample=True,
                top_p=0.96,
                max_length=512,
                temperature=0.65,
                num_beams=1,
                repetition_penalty=1.15,
                no_repeat_ngram_size=6,
                num_return_sequences=1,
            )
            predicted_texts = tokenizer.batch_decode(sample_outputs, skip_special_tokens=False)
            for predicted_text in predicted_texts:
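                # Drop everything after the end token and the "Categories: " prefix, then
                # split the remainder into its categories / title / abstract fields.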
                clean_text = predicted_text.split("</SENTENCE>")[0].split("Categories: ")[1]
                splits = re.split('(?:\nTitle: )|(?:\nAbstract: )', clean_text)
                if len(splits) == 3:
                    result.append({
                        "categories": splits[0].split(),
                        "title": splits[1].strip(),
                        "abstract": splits[2].strip()
                    })
                print("[{} {}]".format(file_num, i), clean_text + "\n")
        with open(os.path.join(RESULTS_PATH, "results_" + str(file_num) + ".json"), "w") as of:
            of.write(json.dumps(result))
else:
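    # Fixed seed so the printed samples are reproducible from run to run.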
    torch.manual_seed(100)
    prompt = '<SENTENCE>Categories: math.NT math.RT\nTitle:'
    generated = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
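    # Same sampling settings as the batch path, but four completions for one fixed category pair.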
    sample_outputs = model.generate(
        generated,
        do_sample=True,
        top_p=0.96,
        max_length=512,
        temperature=0.65,
        num_beams=1,
        repetition_penalty=1.15,
        no_repeat_ngram_size=6,
        num_return_sequences=4,
    )
    predicted_texts = tokenizer.batch_decode(sample_outputs, skip_special_tokens=False)
    for predicted_text in predicted_texts:
        clean_text = predicted_text.split("</SENTENCE>")[0].split("Categories: ")[1]
        print(clean_text + "\n")