Source code for listb.mrtools

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Python functions for accessing search results from MathSciNet and
downloading BibTeX bibliographies associated to the results
"""

import re
import requests
import yaml

from bs4 import BeautifulSoup

def yaml_dump(data, path):
    """ Dumps data into yaml file at `path`

    The file is always written as UTF-8: `yaml_dumps` emits non-ASCII
    characters verbatim (``allow_unicode=True``), so relying on the
    platform's default encoding could fail on non-UTF-8 locales.

    Args:
        data (Dict[Any], etc.): data to be dumped
        path (str): path to yaml file
    """
    with open(path, 'w', encoding='utf-8') as fout:
        yaml_dumps(data, fout)
def yaml_dumps(data, handle):
    """ Serialises `data` as YAML and writes it to an open handle

    Flow style is disabled and non-ASCII characters are written
    unescaped.

    Args:
        data (Dict[Any], etc.): data to be dumped
        handle (handle): handle the data should be dumped into
    """
    dump_options = {'default_flow_style': False, 'allow_unicode': True}
    yaml.dump(data, handle, **dump_options)
def get_mrnumber(doc):
    """ Extracts MR-number from the "headlineText" of the search result

    Args:
        doc (bs4.element.Tag): headlineText

    Returns:
        str: MR-number (digits only, without the "MR" prefix)

    Raises:
        ValueError: if `doc` has no ``mrnum`` element with a ``<strong>``
            child whose text has the form ``MR<digits>``

    Attributes:
        PAT (_sre.SRE_Pattern): precompiled pattern for extracting the
            MR-number
    """
    anchor = doc.find(class_='mrnum')
    strong = anchor.strong if anchor is not None else None
    # `.string` may be None (e.g. <strong> without a single text child),
    # so every step is guarded before the pattern is applied.
    label = strong.string if strong is not None else None
    # Surrounding whitespace is tolerated; PAT itself stays anchored at the
    # start of the label.
    grp = get_mrnumber.PAT.match(label.strip()) if label else None
    if grp is None:
        raise ValueError('no MR-number found in headline: %r' % (label,))
    return grp.group(1)


get_mrnumber.PAT = re.compile(r'MR(\d+)', re.IGNORECASE)
def msn_to_mrnumbers(msn, outfile=None):
    """ Collects the MR-numbers listed on a MathSciNet search page

    Every ``<div class="headlineText">`` in the page source is handed to
    `get_mrnumber`; the resulting numbers are returned in page order.

    Args:
        msn (str OR file handle): source code of the search result
        outfile Optional[str]: if specified the MR-numbers get written to a
            yaml file located at the path

    Returns:
        List[str]: List of MR-numbers found on page

    Example:
        >>> msn = '''<div class="headlineText">
        ... <a class="mrnum" title="Full MathSciNet Item"
        ... href="[...]"><strong>MR3549381</strong></a>
        ... </div>'''
        >>> msn_to_mrnumbers(msn)
        ['3549381']
    """
    page = BeautifulSoup(msn, 'html.parser')
    headlines = page.find_all('div', class_='headlineText')
    found = list(map(get_mrnumber, headlines))
    if outfile:
        yaml_dump(found, outfile)
    return found
def get_bibtex_from_msn(mrnumbers, outfile=None, timeout=30):
    """ Fetches BibTeX file from MathSciNet server using the MR-numbers

    Args:
        mrnumbers (List[str]): the BibTeX entries for these MR-numbers are
            retrieved
        outfile (Optional[str]): path to output file; if given, the BibTeX
            text is also written there (as UTF-8)
        timeout (float or tuple): timeout in seconds passed to
            `requests.get`, so that an unresponsive server cannot block
            the caller forever

    Returns:
        Optional[str]: BibTeX file as string, or None if the response
        contains no ``<div class="doc">`` element

    Note:
        To use this function you need to have access to MathSciNet.

    Example:
        >>> print(get_bibtex_from_msn(['0241312']))  # doctest: +SKIP
        @article {MR0241312,
            AUTHOR = {Shelah, Saharon},
             TITLE = {Note on a min-max problem of {L}eo {M}oser},
           JOURNAL = {J. Combinatorial Theory},
            VOLUME = {6},
              YEAR = {1969},
             PAGES = {298--300},
           MRCLASS = {05.04},
          MRNUMBER = {0241312},
        MRREVIEWER = {G. F. Clements},
        }
    """
    # NOTE(review): these look like query parameters captured from a real
    # MathSciNet search session; presumably only `fmt` and `b` matter for
    # the BibTeX export -- confirm before trimming any of them.
    params = dict(
        bdl="",
        batch_title="Selected+Matches+for%3A+Author%3D%28Shelah%29",
        pg7="ALLF",
        yrop="eq",
        s8="All",
        pg4="AUCN",
        co7="AND",
        co5="AND",
        s6="",
        s5="",
        co4="AND",
        pg5="TI",
        co6="AND",
        pg6="PC",
        s4="Shelah",
        dr="all",
        arg3="",
        yearRangeFirst="",
        pg8="ET",
        s7="",
        review_format="html",
        yearRangeSecond="",
        fmt="bibtex",
        sort="newest",
        searchin="",
        agg_itemtype_Reviewed="Reviewed",
        agg_author_160185="160185"
    )
    # The list is sent as the `b` query parameter.
    params['b'] = mrnumbers
    req = requests.get(
        'http://www.ams.org/mathscinet/search/publications.html',
        params=params,
        timeout=timeout,
    )
    dirty_bib = req.text
    soup = BeautifulSoup(dirty_bib, 'html.parser')
    pre_bib = soup.find('div', class_='doc')
    if not pre_bib:
        # No result container in the response: nothing to extract.
        return None
    # The text of each <pre> inside the container is joined with newlines.
    entries = pre_bib.find_all('pre')
    bib = '\n'.join(str(e.string) for e in entries)
    if outfile:
        # Explicit UTF-8 so the result does not depend on the locale.
        with open(outfile, 'w', encoding='utf-8') as msn_bib:
            msn_bib.write(bib)
    return bib
def crawl(url, timeout=30):
    """ Crawls specified URL on MathSciNet

    If the search result is split into 5 pages and the URL to page 3 is
    passed then the source codes and URLs of pages 3, 4, and 5 are
    returned.

    Args:
        url (str): URL pointing to a search page on MathSciNet
        timeout (float or tuple): timeout in seconds passed to each
            `requests.get` call, so that an unresponsive server cannot
            block the crawl forever

    Returns:
        (List[str], List[str]): List of page source codes and list of
        URLs. The first URL is `url` as passed in; the following entries
        are the ``href`` values of the "Next" links exactly as they appear
        in the pages.

    Note:
        To use this function you need to have access to MathSciNet.
    """
    from urllib.parse import urljoin

    sites = []
    urls = [url]
    while True:
        req = requests.get(url, timeout=timeout)
        site = req.text
        sites.append(site)
        soup = BeautifulSoup(site, 'html.parser')
        next_link = soup.find('a', string='Next')
        if next_link is None:
            # Last page reached: no "Next" link left to follow.
            break
        href = next_link['href']
        urls.append(href)
        # Links on MathSciNet are relative. urljoin resolves them against
        # the site root without doubling the slash for hrefs that start
        # with "/", and leaves absolute hrefs intact.
        url = urljoin('http://www.ams.org/', href)
    return sites, urls
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in the docstrings,
    # comparing output with whitespace differences normalised.
    from doctest import NORMALIZE_WHITESPACE, testmod
    testmod(optionflags=NORMALIZE_WHITESPACE)