# Source code for pyroxy.controllers.simple

# Copyright 2011 Andrew McFague
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os.path

import bottle
import lxml.html

from bottle import static_file
from pyroxy import app, config


log = logging.getLogger(__name__)


def asbool(val):
    """
    Attempts to coerce :data:`val` to a boolean value.  This is largely based
    off of :func:`paste.deploy.converters.asbool` with a few stylistic tweaks
    added.

    Strings are stripped of surrounding whitespace and compared
    case-insensitively against the accepted spellings.  Anything that is not a
    string is simply passed through :func:`bool`.

    :param val:
        Value to coerce; either a string such as ``"yes"``/``"off"`` or any
        other object.
    :raises:
        :exc:`ValueError` if the value could not be coerced to a :const:`True`
        or :const:`False` value.
    :returns:
        :const:`True` or :const:`False`, depending on what :data:`val` was
        coerced to.
    """
    # ``basestring`` only exists on Python 2.  Fall back to ``str`` so the
    # string check does not blow up with a NameError on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # If it's not a string or string-like object, just do our best.
    if not isinstance(val, string_types):
        return bool(val)

    obj = val.strip().lower()
    if obj in ('true', 'yes', 'on', 'y', 't', '1'):
        return True
    if obj in ('false', 'no', 'off', 'n', 'f', '0'):
        return False
    raise ValueError("Could not coerce `%s` to True/False" % obj)


def pred_filter_internal_download_links(package_name, href, title):
    """
    Predicate used to filter internal download links.  Currently, the only
    config value is ``allowed_extensions``.

    :param string package_name:
        Name of the Python package to retrieve and filter the simple page for.
    :param href:
        Relative URL generated from the simple page (i.e.,
        `../../packages/a/b/c/pyroxy-0.1.tar.gz`).  Not consulted by this
        predicate.
    :param title:
        Text component of the link from the simple page.  This can include
        suffixes such as `download_url` and `home_page`, which can be used as
        part of the filter.
    :return:
        :const:`True` if the internal download link should be included in the
        output, :const:`False` otherwise.  When no ``allowed_extensions`` is
        configured for the package, every link is accepted.
    """
    allowed_extensions = config.get_package_config(
        package_name, 'allowed_extensions')
    if allowed_extensions is None:
        # Nothing configured for this package: do not restrict by extension.
        return True

    # Only the text after the *last* dot is compared, so ``foo-0.1.tar.gz``
    # is matched as ``gz``.  A title without any dot is compared whole.
    extension = title.rpartition('.')[2]
    return extension in allowed_extensions


def pred_filter_home_pages(package_name, href, title):
    """
    Predicate used to filter home pages.

    :param string package_name:
        Name of the Python package to retrieve and filter the simple page for.
        Not consulted by this predicate.
    :param href:
        Absolute URL to a `home page` (i.e., a third party site).  May be
        :const:`None` or empty for an anchor without an ``href`` attribute, in
        which case the link is never treated as a home page.
    :param title:
        Text component of the link from the simple page.  This can include
        suffixes such as `download_url` and `home_page`, which can be used as
        part of the filter.
    :returns:
        :const:`True` if the home page link should be included in the output,
        :const:`False` otherwise.
    """
    # Anchors without an ``href`` yield ``None``; guard so that we return
    # False instead of raising AttributeError on ``None.startswith``.
    if not href:
        return False
    return href.startswith("http") and "home_page" in title


def pred_filter_external_download_links(package_name, href, title):
    """
    Predicate used to filter external download links.  A link qualifies
    purely on its text containing ``download_url``; no configuration value is
    consulted.

    :param string package_name:
        Name of the Python package to retrieve and filter the simple page for.
        Not consulted by this predicate.
    :param href:
        Absolute URL pointing to a downloadable file on a third-party web site.
        Not consulted by this predicate.
    :param title:
        Text component of the link from the simple page.  This can include
        suffixes such as `download_url` and `home_page`, which can be used as
        part of the filter.
    :returns:
        :const:`True` if the external download link should be included in the
        output, :const:`False` otherwise.
    """
    return "download_url" in title


def filter_index(index_path, package_name=None):
    """
    Filters the index page (index.html, typically) found at
    :data:`index_path`.  This function will parse the file into an lxml tree,
    call :func:`remove_links` on the resulting tree, and then convert it back
    to an HTML page.

    :param index_path:
        Absolute path to the index file.  This MUST already be created, and must
        be a file.
    :param string package_name:
        Name of the Python package the index belongs to, handed on to
        :func:`remove_links`.  When omitted, the name of the directory that
        contains :data:`index_path` is used.
    :raises:
        If the file could not be opened, a 404 exception will be raised.
    :returns:
        An lxml formatted HTML page as a string, containing the final, filtered
        version of the page.
    """
    if package_name is None:
        # :func:`package_list` builds the path as
        # ``<pypi_web_path>/simple/<package_name>/index.html``, so the parent
        # directory name is the package name.
        package_name = os.path.basename(os.path.dirname(index_path))

    try:
        fd = open(index_path, "r")
    except IOError:
        log.exception("Could not open %s for filtering.", index_path)
        bottle.abort(404)

    # Release the file handle as soon as lxml has finished reading from it.
    with fd:
        html_tree = lxml.html.parse(fd)

    # remove_links() requires the package name as its first argument.
    html_tree = remove_links(package_name, html_tree)
    return lxml.html.tostring(html_tree)


def split_links(package_name, elements):
    """
    Parses through a list of elements, and splits them up based on their type.

    Each element is tested against the predicates in a fixed order --
    internal download link, home page, external download link -- and is
    placed in the bucket of the first predicate that accepts it.  Elements
    accepted by none of them are collected as unknown links.

    :param string package_name:
        Name of the Python package to retrieve and filter the simple page for.
    :param iterable elements:
        iterable containing :class:`lxml.etree.ElementTree.Element`
    :returns:
        Tuple of lists, ``(external_download_links, home_pages,
        internal_download_links, unknown_links)``.
    """
    external_download_links = []
    home_pages = []
    internal_download_links = []
    unknown_links = []

    for element in elements:
        href = element.get("href")
        title = element.text_content()

        # First match wins; the order of these checks is significant.
        if pred_filter_internal_download_links(package_name, href, title):
            bucket = internal_download_links
        elif pred_filter_home_pages(package_name, href, title):
            bucket = home_pages
        elif pred_filter_external_download_links(package_name, href, title):
            bucket = external_download_links
        else:
            bucket = unknown_links
        bucket.append(element)

    return (external_download_links, home_pages, internal_download_links,
            unknown_links)


def build_links_to_remove(external_download_links, home_pages,
                          internal_download_links, unknown_links):
    """
    Generates a list of links to remove based on which links are preferred.

    The preference order is internal download links, then external download
    links, then home pages.  Every category ranked below the most preferred
    non-empty one is scheduled for removal; if all three preferred categories
    are empty, nothing is removed.

    :param list external_download_links:
        list of external download links
    :param list home_pages:
        list of home pages
    :param list internal_download_links:
        list of internal download links
    :param list unknown_links:
        list of unknown links, or links that do not match any of the
        previous three.
    :returns:
        list of :class:`lxml.etree.ElementTree.Element` that are safe to be
        removed.
    """
    if internal_download_links:
        return external_download_links + home_pages + unknown_links
    if external_download_links:
        return home_pages + unknown_links
    if home_pages:
        return unknown_links
    return []


def remove_links(package_name, html_tree):
    """
    Filters out links based on the various predicates.  Unfortunately, right
    now, the predicates aren't configurable.

    :param string package_name:
        Name of the Python package to retrieve and filter the simple page for.
    :param html_tree:
        :mod:`lxml.tree` representing the simple page.  It is modified in
        place.
    :returns:
        A modified :mod:`lxml.tree` with specific links filtered out.
    """
    (external_download_links, home_pages, internal_download_links,
     unknown_links) = split_links(package_name, html_tree.iterfind(".//a"))
    to_be_removed = build_links_to_remove(
        external_download_links, home_pages, internal_download_links,
        unknown_links)

    # So that we preserve the funky formatting of the page, lets make sure that
    # we just pull out the tags in place, without disrupting the HTML page
    # itself.
    for element in to_be_removed:
        # Take the line break that trails the link along with it.  A link can
        # be the last sibling (``getnext()`` returns None) or be followed by
        # something other than a ``<br>``; leave the neighbour alone then.
        trailing = element.getnext()
        if trailing is not None and trailing.tag == "br":
            trailing.drop_tree()
        element.drop_tree()

    return html_tree


@app.route("/simple/<package_name>/")
@app.route("/simple/<package_name>/index.html")
def package_list(package_name):
    """
    Controller used to serve up a package listing.

    :param string package_name:
        Name of the Python package to retrieve and filter the simple page for.
        Capitalization matters, depending on your operating system--i.e., on
        Linux, `pylons` != `Pylons`.
    :raises:
        A 404 exception if :data:`package_name` is not a plain directory name
        (``.``, ``..`` or anything containing a path separator).
    :returns:
        The raw simple page if the package was whitelisted, or a filtered page
        if it was not.
    """
    # The name comes straight from the URL and is joined into a filesystem
    # path below; refuse anything that could step outside the simple directory.
    if (package_name in (os.curdir, os.pardir)
            or os.path.basename(package_name) != package_name):
        bottle.abort(404)

    root = os.path.join(config['pypi_web_path'], "simple", package_name)

    # The whitelist is matched against the lower-cased package name.
    if package_name.lower() in config.get('whitelisted_packages', []):
        return static_file("index.html", root=root)

    return filter_index(os.path.join(root, "index.html"))
Navigation

Source code for pyroxy.controllers.simple

Project Versions

RTD Search

Quick search

Navigation