Source code for listb.mrtools

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Python functions for accessing search results from MathSciNet and
downloading BibTeX bibliographies associated to the results
"""

import re
from urllib.parse import urljoin

import requests
import yaml
from bs4 import BeautifulSoup

def yaml_dump(data, path):
    """ Dumps data into yaml file at `path`

    The file is created (or overwritten) and is always written as UTF-8:
    `yaml_dumps` emits non-ASCII characters unescaped, so relying on the
    platform's default encoding could fail on such data.

    Args:
        data (Dict[Any], etc.): data to be dumped
        path (str):             path to yaml file
    """
    with open(path, 'w', encoding='utf-8') as fout:
        yaml_dumps(data, fout)

def yaml_dumps(data, handle):
    """ Dumps data into handle

    The output uses YAML block style (``default_flow_style=False``) and
    keeps non-ASCII characters unescaped (``allow_unicode=True``).

    Args:
        data (Dict[Any], etc.): data to be dumped
        handle (handle):        handle the data should be dumped into
    """
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
    }
    yaml.dump(data, handle, **dump_options)

def get_mrnumber(doc):
    """ Extracts MR-number from the "headlineText" of the search result

    The number is read from the ``<strong>`` child of the first element with
    class ``mrnum`` inside `doc`; its text must start with ``MR`` (case is
    ignored) followed by digits.

    Args:
        doc (bs4.element.Tag):  headlineText

    Returns:
        str:    MR-number (digits only, without the ``MR`` prefix)

    Raises:
        AttributeError: if `doc` has no ``mrnum`` element or that element
                        has no ``<strong>`` child
        TypeError:      if the text does not start with an MR-number

    Attributes:
        PAT (re.Pattern):
            precompiled pattern for extracting the MR-number
    """
    label = doc.find(class_='mrnum').strong.string
    # `match` anchors at the start of the label; group 1 holds the digits.
    found = get_mrnumber.PAT.match(label)
    return found[1]


# Compiled once and stored on the function so every call reuses it.
get_mrnumber.PAT = re.compile(r'MR(\d+)', re.IGNORECASE)

def msn_to_mrnumbers(msn, outfile=None):
    """ Retrieves MR-numbers from the source code of a search page

    Every ``<div class="headlineText">`` on the page is treated as one
    search result and its MR-number is extracted with `get_mrnumber`.

    Args:
        msn (str OR file handle):   source code of the search result
        outfile Optional[str]:
            if specified the MR-numbers get written to a yaml file located at
            the path

    Returns:
        List[str]:  List of MR-numbers found on page, in page order

    Example:
        >>> msn = '''<div class="headlineText">
        ...            <a class="mrnum" title="Full MathSciNet Item"
        ...             href="[...]"><strong>MR3549381</strong></a>
        ...          </div>'''
        >>> msn_to_mrnumbers(msn)
        ['3549381']
    """
    page = BeautifulSoup(msn, 'html.parser')
    headlines = page.find_all('div', class_='headlineText')

    mrnumbers = []
    for headline in headlines:
        mrnumbers.append(get_mrnumber(headline))

    if outfile:
        yaml_dump(mrnumbers, outfile)
    return mrnumbers

def get_bibtex_from_msn(mrnumbers, outfile=None):
    """ Fetches BibTeX file from MathSciNet server using the MR-numbers

    Args:
        mrnumbers (List[str]):
            the BibTeX entries for these MR-numbers are retrieved
        outfile (Optional[str]):
            path to output file; if given, the BibTeX is additionally
            written there (UTF-8 encoded)

    Returns:
        Optional[str]:
            BibTeX file as string, or ``None`` if the response contains no
            ``<div class="doc">`` element (nothing is written to `outfile`
            in that case)

    Raises:
        requests.exceptions.Timeout:
            if the server does not answer within 30 seconds

    Note:
        To use this function you need to have access to MathSciNet.

    Example:
        >>> print(get_bibtex_from_msn(['0241312'])) # doctest: +SKIP
        @article {MR0241312,
            AUTHOR = {Shelah, Saharon},
             TITLE = {Note on a min-max problem of {L}eo {M}oser},
           JOURNAL = {J. Combinatorial Theory},
            VOLUME = {6},
              YEAR = {1969},
             PAGES = {298--300},
           MRCLASS = {05.04},
          MRNUMBER = {0241312},
        MRREVIEWER = {G. F. Clements},
        }
    """
    # Fixed query fields sent with every request; only `b` (the selected
    # MR-numbers) varies and `fmt` asks for BibTeX output.
    # NOTE(review): `batch_title` is already percent-encoded and requests
    # will encode it again -- presumably the server ignores it; confirm.
    params = dict(
        bdl="",
        batch_title="Selected+Matches+for%3A+Author%3D%28Shelah%29",
        pg7="ALLF",
        yrop="eq",
        s8="All",
        pg4="AUCN",
        co7="AND",
        co5="AND",
        s6="",
        s5="",
        co4="AND",
        pg5="TI",
        co6="AND",
        pg6="PC",
        s4="Shelah",
        dr="all",
        arg3="",
        yearRangeFirst="",
        pg8="ET",
        s7="",
        review_format="html",
        yearRangeSecond="",
        fmt="bibtex",
        sort="newest",
        searchin="",
        agg_itemtype_Reviewed="Reviewed",
        agg_author_160185="160185",
        b=mrnumbers,
    )
    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(
        'http://www.ams.org/mathscinet/search/publications.html',
        params=params,
        timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', class_='doc')
    if not container:
        return None

    # Each BibTeX entry sits in its own <pre> element.
    entries = container.find_all('pre')
    bib = '\n'.join(str(entry.string) for entry in entries)

    if outfile:
        with open(outfile, 'w', encoding='utf-8') as msn_bib:
            msn_bib.write(bib)

    return bib

def crawl(url):
    """ Crawls specified URL on MathSciNet

    If the search result is split into 5 pages and the URL to page
    3 is passed then the source codes and URLs of pages 3, 4, and 5
    are returned.

    Pages are followed through the link whose text is exactly ``Next``
    until a page without such a link is reached.

    Args:
        url (str):  URL pointing to a search page on MathSciNet

    Returns:
        (List[str], List[str]):
            List of page source codes and list of URLs. The first URL is
            `url` as passed in; the following ones are the ``href`` values
            of the "Next" links exactly as they appear in the pages.

    Raises:
        requests.exceptions.Timeout:
            if the server does not answer a request within 30 seconds

    Note:
        To use this function you need to have access to MathSciNet.
    """
    sites = []
    urls = [url]
    while True:
        # Without a timeout requests may block forever on a dead server.
        response = requests.get(url, timeout=30)
        site = response.text
        sites.append(site)

        next_link = BeautifulSoup(site, 'html.parser').find('a', string='Next')
        if not next_link:
            break

        href = next_link['href']
        urls.append(href)
        # Links on MathSciNet are relative; urljoin resolves them against the
        # site root without doubling the slash for root-relative hrefs and
        # leaves absolute hrefs intact.
        url = urljoin('http://www.ams.org/', href)
    return sites, urls

if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in the docstrings,
    # comparing output with whitespace differences ignored.
    from doctest import NORMALIZE_WHITESPACE, testmod
    testmod(optionflags=NORMALIZE_WHITESPACE)