citation-fetcher/citation_fetcher.py at main · Hhhpraise/citation-fetcher · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
import requests
import time
import re
import json
from urllib.parse import quote
import xml.etree.ElementTree as ET


def clean_title(title):
    """Strip characters that are unsafe for searching from a paper title.

    Word characters (including Unicode letters), whitespace and the
    punctuation ``- : . , ; ? ! ' " ( )`` are kept; everything else is
    removed. Leading and trailing whitespace is trimmed from the result.
    """
    disallowed = re.compile(r'[^\w\s\-:.,;?!\'"()]', re.UNICODE)
    without_noise = disallowed.sub('', title)
    return without_noise.strip()


def fetch_bibtex_from_crossref(paper_title):
    """Fetch a BibTeX citation for a paper title from the Crossref REST API.

    The title is searched with ``query.bibliographic``; the DOI of the top hit
    is then resolved to BibTeX through Crossref's transform endpoint.

    Args:
        paper_title (str): Paper title to search for.

    Returns:
        str | None: BibTeX text for the best match, or None when the title is
        blank, nothing matched, the match has no DOI, or a request failed
        (failures are printed, not raised).
    """
    # A blank query would make Crossref return an unrelated top hit.
    if not paper_title or not paper_title.strip():
        return None
    query = clean_title(paper_title)
    if not query:
        return None

    base_url = "https://api.crossref.org/works"
    params = {
        'query.bibliographic': query,
        'rows': 1,
        'mailto': 'user@example.com'  # Contact address for Crossref's polite usage; replace with a real one
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json; charset=utf-8',
        'Accept-Charset': 'utf-8'
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()

        # Ensure proper UTF-8 decoding
        response.encoding = 'utf-8'
        data = response.json()

        # Walk the payload defensively: an unexpected shape means "no match",
        # not an uncaught KeyError/TypeError in the caller's loop.
        message = data.get('message') if isinstance(data, dict) else None
        items = message.get('items') if isinstance(message, dict) else None
        if not isinstance(items, list) or not items:
            return None

        work = items[0]
        doi = work.get('DOI') if isinstance(work, dict) else None
        if not doi:
            return None

        # DOIs may contain characters such as '#', '?', '<' or '>' that would
        # corrupt the URL path; percent-encode them but keep the '/' separator.
        bibtex_url = f"{base_url}/{quote(doi, safe='/')}/transform/application/x-bibtex"
        bib_response = requests.get(bibtex_url, headers=headers, timeout=10)
        bib_response.raise_for_status()
        bib_response.encoding = 'utf-8'
        return bib_response.text

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for '{paper_title[:50]}...': {e}")
        return None
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; catching the base class also
        # covers decoders that raise a plain ValueError.
        print(f"JSON decoding error for '{paper_title[:50]}...': {e}")
        return None


def _arxiv_entry_to_bibtex(entry, ns):
    """Convert one Atom ``<entry>`` element from the arXiv feed into BibTeX.

    Args:
        entry (xml.etree.ElementTree.Element): The Atom entry element.
        ns (dict): Namespace map providing the ``atom`` prefix.

    Returns:
        str | None: A BibTeX ``@article`` entry, or None when the entry has
        no title or no authors.
    """
    # Collapse all whitespace so line breaks inside the feed's title text do
    # not end up inside the BibTeX field.
    raw_title = entry.findtext('atom:title', default='', namespaces=ns) or ''
    title = ' '.join(raw_title.split())

    authors = [
        name_elem.text.strip()
        for name_elem in entry.findall('atom:author/atom:name', ns)
        if name_elem.text and name_elem.text.strip()
    ]

    if not title or not authors:
        return None

    # Only use the year when the feed actually supplies one; never invent it.
    published = (entry.findtext('atom:published', default='', namespaces=ns) or '').strip()
    year = published[:4] if published[:4].isdigit() else ''

    # The identifier is whatever follows ".../abs/" in the entry id URL. It is
    # kept verbatim (including any '/') so that the note and URL stay valid.
    id_text = entry.findtext('atom:id', default='', namespaces=ns) or ''
    match = re.search(r'arxiv\.org/abs/(\S+)', id_text)
    arxiv_id = match.group(1) if match else ''

    if arxiv_id:
        # Slashes are dropped from the citation key only.
        bib_key = f"arxiv:{arxiv_id.replace('/', '')}"
    else:
        # Fallback: use first author's last name and the year
        first_author = re.sub(r'[^\w]', '', authors[0].split()[-1].lower())
        bib_key = f"{first_author}{year}"

    # Escape braces in the title so they are not read as BibTeX grouping
    title_escaped = title.replace('{', '\\{').replace('}', '\\}')

    fields = [
        "  title = {" + title_escaped + "}",
        "  author = {" + ' and '.join(authors) + "}",
    ]
    if year:
        fields.append("  year = {" + year + "}")
    fields.append("  journal = {arXiv preprint}")
    fields.append("  note = {arXiv:" + (arxiv_id if arxiv_id else 'preprint') + "}")
    if arxiv_id:
        # Without an identifier the URL would be a dangling ".../abs/" link.
        fields.append("  url = {https://arxiv.org/abs/" + arxiv_id + "}")

    return "@article{" + bib_key + ",\n" + ",\n".join(fields) + "\n}"


def fetch_bibtex_from_arxiv(paper_title):
    """Alternative: build a BibTeX entry from the arXiv API if Crossref fails.

    Searches arXiv by title and converts the first returned entry into an
    ``@article`` BibTeX record.

    Args:
        paper_title (str): Paper title to search for.

    Returns:
        str | None: BibTeX text for the first hit, or None when the cleaned
        title is empty, nothing matched, the entry lacks a title or authors,
        or the request/XML parsing failed (failures are printed, not raised).
    """
    # Clean title for ArXiv search: punctuation becomes spaces, whitespace collapses
    clean_title_search = re.sub(r'[^\w\s\-]', ' ', paper_title or '', flags=re.UNICODE)
    clean_title_search = re.sub(r'\s+', ' ', clean_title_search).strip()
    if not clean_title_search:
        # Nothing left to search for; skip the network round trip.
        return None

    base_url = "https://export.arxiv.org/api/query"
    params = {
        'search_query': f'ti:"{clean_title_search}"',
        'max_results': 1,
        'sortBy': 'relevance',
        'sortOrder': 'descending'
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/xml; charset=utf-8'
    }

    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the raw bytes so the XML parser honours the declared encoding
        root = ET.fromstring(response.content)

        # Namespace used by the Atom feed elements
        ns = {'atom': 'http://www.w3.org/2005/Atom'}

        entry = root.find('atom:entry', ns)
        if entry is None:
            return None
        return _arxiv_entry_to_bibtex(entry, ns)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching from ArXiv for '{paper_title[:50]}...': {e}")
        return None
    except ET.ParseError as e:
        print(f"XML parsing error for '{paper_title[:50]}...': {e}")
        return None


def save_bibtex_to_file(paper_titles, output_file="citations.bib", delay=1):
    """
    Main function to fetch and save BibTeX citations with proper Unicode handling

    Each title is looked up on Crossref first and on arXiv second. Titles that
    neither source resolves get a placeholder entry so they can be completed
    by hand. A summary is printed at the end.

    Args:
        paper_titles (iterable of str): Paper titles to search for
        output_file (str): Output file name (overwritten, written as UTF-8)
        delay (int | float): Delay between titles in seconds (to be polite to
            APIs); no delay is applied after the last title
    """
    # Materialise once so generators work and the total is known up front.
    paper_titles = list(paper_titles)
    total = len(paper_titles)

    successful = 0
    failed = []

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("% Auto-generated BibTeX citations\n")
        f.write("% Generated using academic APIs\n")
        f.write("% Encoding: UTF-8\n\n")

        for i, title in enumerate(paper_titles, 1):
            print(f"Processing ({i}/{total}): {title[:80]}...")

            # Try Crossref first
            bibtex = fetch_bibtex_from_crossref(title)

            # If Crossref fails, try ArXiv
            if not bibtex:
                bibtex = fetch_bibtex_from_arxiv(title)

            if bibtex:
                f.write(bibtex)
                f.write("\n\n")
                successful += 1
                print(f"✓ Successfully fetched citation")
            else:
                failed.append(title)
                print(f"✗ Failed to fetch citation")

                # Add placeholder entry for missing citations
                safe_key = re.sub(r'[^\w]', '_', title[:50].lower())
                placeholder = f"""@article{{{safe_key},
  title = {{{title}}},
  author = {{Unknown}},
  year = {{}},
  note = {{Citation not found, please add manually}},
  url = {{}}
}}
"""
                f.write(placeholder)
                f.write("\n\n")

            # Be polite to APIs, but do not stall once the last title is done
            if i < total and delay > 0:
                time.sleep(delay)

    # Print summary
    print(f"\n{'=' * 50}")
    print(f"SUMMARY")
    print(f"{'=' * 50}")
    print(f"Successful: {successful}/{total}")
    print(f"Failed: {len(failed)}")

    if failed:
        print(f"\nFailed papers:")
        for i, paper in enumerate(failed, 1):
            print(f"  {i}. {paper[:100]}...")


def read_titles_from_file(filename):
    """Read paper titles from a text file (one per line) with proper Unicode handling

    Blank lines are skipped and surrounding whitespace is stripped from each
    title. The file is decoded as UTF-8 first (a leading byte-order mark is
    dropped), then as cp1252, then as latin-1.

    Args:
        filename (str): Path of the text file to read.

    Returns:
        list[str]: The titles in file order, or an empty list when the file
        cannot be opened or decoded (the problem is printed, not raised).
    """
    # 'utf-8-sig' behaves like UTF-8 but strips a BOM so it cannot leak into
    # the first title. 'latin-1' maps every byte value, so it goes last as the
    # catch-all; listing it earlier would make later entries unreachable.
    encodings = ('utf-8-sig', 'cp1252', 'latin-1')

    for position, encoding in enumerate(encodings):
        try:
            with open(filename, 'r', encoding=encoding) as f:
                titles = [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            continue
        except OSError as e:
            # Missing or unreadable file: report it and return no titles so
            # the caller can fall back instead of crashing.
            print(f"Error: Could not open {filename}: {e}")
            return []

        if position > 0:
            print(f"Successfully read file with {encoding} encoding")
        return titles

    print(f"Error: Could not read {filename} with any supported encoding")
    return []


def generate_bibtex_manually(paper_title):
    """Build a placeholder BibTeX ``@article`` entry for an unresolved title.

    The citation key is the first 50 characters of the title, lower-cased,
    with every non-word character replaced by an underscore. All fields other
    than the title are left blank or marked for manual completion.
    """
    key_source = paper_title[:50].lower()
    citation_key = re.sub(r'[^\w]', '_', key_source)

    entry_lines = [
        f"@article{{{citation_key},",
        f"  title = {{{paper_title}}},",
        "  author = {Unknown},",
        "  year = {},",
        "  journal = {},",
        "  note = {Citation not found automatically, please fill in manually},",
        "  url = {}",
        "}",
    ]
    return "\n".join(entry_lines)


# Script entry point: load titles from papers.txt and write my_citations.bib
if __name__ == "__main__":
    # Titles may also be supplied inline instead of from a file, e.g.
    #     papers = ["Attention Is All You Need", "..."]
    # By default they are read from a text file, one title per line.
    papers = read_titles_from_file("papers.txt")

    if not papers:
        # No titles came back: seed papers.txt with three samples and re-read it.
        print("Error: No papers found. Please check your input file or list.")
        print("Creating example file 'papers.txt' with sample papers...")
        sample_lines = (
            "Attention Is All You Need\n",
            "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\n",
            "Improving Language Understanding by Generative Pre-Training\n",
        )
        with open("papers.txt", "w", encoding="utf-8") as f:
            f.writelines(sample_lines)
        papers = read_titles_from_file("papers.txt")

    # Fetch every citation and write them to the output file
    print(f"Processing {len(papers)} papers...")
    save_bibtex_to_file(papers, "my_citations.bib", delay=1.5)

    print("\nCitations saved to 'my_citations.bib'")