# Source code for elgeopaso.jobs.crawlers.georezo_rss_parser

#! python3  # noqa: E265

"""
    Name:         GeoRezo Jobs RSS Parser
    Purpose:      Parse GeoRezo RSS
    Python:       3.7+
"""

# ##############################################################################
# ########## Libraries #############
# ##################################

# Standard library
import json
import logging
from datetime import datetime
from pathlib import Path
from urllib.parse import parse_qs, urlparse

# 3rd party modules
import feedparser

# ############################################################################
# ########## GLOBALS #############
# ################################

# Base URL of the feedparser documentation (not of the RSS feed itself).
# It is only used to build help links appended to log messages.
FEEDPARSER_DOC_BASE_URL = "https://pythonhosted.org/feedparser/"

# ############################################################################
# ########## Classes #############
# ################################


class GeorezoRssParser:
    """Handy class to parse GeoRezo job offers through RSS.

    :param str feed_base_url: URL to the feed. Defaults to: "https://georezo.net/extern.php?fid=10" - optional
    :param str feed_length_param: name of the URL parameter to specify the number of items. Defaults to: "show" - optional
    :param int items_to_parse: number of items to request to the feed. Defaults to: 50 - optional
    :param str user_agent: HTTP user-agent. Defaults to: "ElGeoPaso/DEV +https://elgeopaso.georezo.net/" - optional
    """

    # Attributes

    # Feed datetime structure
    # see: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    FEED_DATETIME_RAW_FORMAT = "%a, %d %b %Y %H:%M:%S %z"
    # NOTE(review): presumably the same structure written with Arrow library tokens;
    # it is not used anywhere in this class - confirm against callers.
    FEED_DATETIME_RAW_FORMAT_ARROW = "ddd, D MMM YYYY HH:mm:ss Z"

    # File to store the feed metadata
    CRAWLER_LATEST_METADATA = "crawler_georezo_rss_latest.json"

    def __init__(
        self,
        feed_base_url: str = "https://georezo.net/extern.php?fid=10",
        feed_length_param: str = "show",
        items_to_parse: int = 50,
        user_agent: str = "ElGeoPaso/DEV +https://elgeopaso.georezo.net/",
    ):
        """Instantiate the class.

        Beware: the user agent is set on the ``feedparser`` module itself, so it \
            applies to every feed parsed by the running process, not only to this instance.
        """
        # store parameters as attributes
        self.feed_base_url = feed_base_url
        self.feed_length_param = feed_length_param
        self.items_to_parse = items_to_parse

        # set user agent to global feedparser
        # see: https://pythonhosted.org/feedparser/http-useragent.html
        feedparser.USER_AGENT = user_agent

    def _build_feed_url(self) -> str:
        """Build RSS feed URL from class attributes.

        The length parameter is appended only if both its name and its value are \
            truthy. The separator is picked from the base URL: ``?`` when it has no \
            query string yet, ``&`` otherwise.

        :return: RSS feed URL with parameters
        :rtype: str
        """
        if self.feed_length_param and self.items_to_parse:
            if "?" not in self.feed_base_url:
                separator = "?"
            elif self.feed_base_url.endswith(("?", "&")):
                # query string already opened, nothing to insert
                separator = ""
            else:
                separator = "&"
            complete_feed_url = "{}{}{}={}".format(
                self.feed_base_url,
                separator,
                self.feed_length_param,
                self.items_to_parse,
            )
        else:
            complete_feed_url = self.feed_base_url

        logging.debug("Feed URL built: %s", complete_feed_url)
        return complete_feed_url

    @classmethod
    def extract_offer_id_from_url(cls, in_url: str) -> int:
        """Parse input URL to extract RSS item ID = job offer ID.

        :param str in_url: input URL as string. In GeoRezo RSS, it's:
          - in raw XML: '<guid isPermaLink="true">https://georezo.net/forum/viewtopic.php?pid=331081#p331081</guid>'
          - parsed by feedparser: entry.id = 'https://georezo.net/forum/viewtopic.php?pid=331144#p331144'

        :raises ValueError: if the URL has no 'pid' parameter or if it contains no digit

        :return: offer ID
        :rtype: int
        """
        # parse_qs returns a list of values per parameter, e.g. {"pid": ["331081"]}
        pid_values = parse_qs(urlparse(in_url).query).get("pid")
        logging.debug("Offer ID extracted: %s from URL '%s'", pid_values, in_url)

        if not pid_values:
            raise ValueError("No 'pid' parameter found in URL: {!r}".format(in_url))

        # keep only digits, looking at every character of every value
        digits = "".join(
            char for value in pid_values for char in value if char.isdigit()
        )
        if not digits:
            raise ValueError(
                "'pid' parameter contains no digit in URL: {!r}".format(in_url)
            )

        return int(digits)

    @classmethod
    def load_previous_crawler_metadata(
        cls, from_source: str = "./last_id_georezo.txt"
    ) -> dict:
        """Retrieve metadata of the previous crawler execution from specified source.

        Only existing files with a ``.json`` suffix are read. In any other case \
            (missing file, other suffix, unreadable or invalid JSON, JSON which is \
            not an object) a fallback dictionary is returned instead of raising.

        :param str from_source: path to the file to load. Defaults to: "./last_id_georezo.txt"

        :return: dictionary with previous crawler execution metadata. Fallback is: \
            ``{"latest_offer_id": 0, "feed_updated_parsed": None}``
        :rtype: dict
        """
        fallback = {"latest_offer_id": 0, "feed_updated_parsed": None}
        in_source = Path(from_source)

        if not (in_source.exists() and in_source.suffix == ".json"):
            logging.warning(
                "File with the latest ID offer is missing: %s. "
                "Considering latest ID = 0 and updated_parsed = None.",
                in_source.resolve(),
            )
            return fallback

        logging.info("Reading last parsed item ID from file: %s", from_source)
        try:
            with in_source.open("r", encoding="utf-8") as in_json:
                out_dict = json.load(in_json)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError
            logging.error(
                "Unable to read previous crawler metadata from %s: %s. "
                "Considering latest ID = 0 and updated_parsed = None.",
                in_source,
                err,
            )
            return fallback

        if not isinstance(out_dict, dict):
            logging.error(
                "Previous crawler metadata in %s is not a JSON object. "
                "Considering latest ID = 0 and updated_parsed = None.",
                in_source,
            )
            return fallback

        return out_dict

    def _collect_feed_metadata(self, feed_parsed: "feedparser.FeedParserDict") -> dict:
        """Gather tracking metadata from a parsed feed, without writing anything.

        :param feedparser.FeedParserDict feed_parsed: parsed feed

        :return: dictionary of metadata, ready to be serialized as JSON
        :rtype: dict
        """
        entries = feed_parsed.get("entries") or []

        # latest job offer ID = ID of the first entry which has a usable one
        last_job_offer_id = 0
        for entry in entries:
            try:
                last_job_offer_id = self.extract_offer_id_from_url(entry.id)
            except (AttributeError, TypeError, ValueError):
                continue
            break
        else:
            # reached when there is no entry or when none carries a valid ID
            logging.warning("Unable to retrive latest job offer ID")

        # convert feed date to datetime. Feed-level data can be empty, so every
        # value is read with .get() to avoid failing on a missing key.
        feed_info = feed_parsed.get("feed") or {}
        updated_raw = feed_info.get("updated")
        feed_build_dt = None
        try:
            # NOTE: %a and %b are locale dependent
            feed_build_dt = datetime.strptime(
                updated_raw, self.FEED_DATETIME_RAW_FORMAT
            )
        except (TypeError, ValueError) as err:
            logging.error(
                "Feed date '%s' can't be parsed with format: %s. Error: %s",
                updated_raw,
                self.FEED_DATETIME_RAW_FORMAT,
                err,
            )

        return {
            "feed_updated_raw": updated_raw,
            # JSON null rather than the string "None" when conversion failed
            "feed_updated_converted": (
                str(feed_build_dt) if feed_build_dt is not None else None
            ),
            "feed_updated_parsed": feed_info.get("updated_parsed"),
            "entries_required": self.items_to_parse,
            "entries_total": len(entries),
            "encoding": feed_parsed.get("encoding"),
            "latest_offer_id": last_job_offer_id,
            "status": feed_parsed.get("status"),
            "version": feed_parsed.get("version"),
        }

    def save_parsing_metadata(
        self, feed_parsed: "feedparser.FeedParserDict", save_type: str = "json"
    ) -> dict:
        """Dumps some metadata from parsed feed to track behavior and enforce future \
            usage into a structured JSON file (see ``CRAWLER_LATEST_METADATA``).

        :param feedparser.FeedParserDict feed_parsed: parsed feed
        :param str save_type: type of save to perform. Only "json" is supported: \
            with any other value nothing is written. Defaults to: "json" - optional

        :return: dictionary of metadata extracted from the feed
        :rtype: dict

        :example:

        .. code-block:: json

            {
                "encoding": "ISO-8859-1",
                "entries_required": 50,
                "entries_total": 50,
                "feed_updated_converted": "2020-03-10 13:07:06+01:00",
                "feed_updated_parsed": [2020, 3, 10, 12, 7, 6, 1, 70, 0],
                "feed_updated_raw": "Tue, 10 Mar 2020 13:07:06 +0100",
                "latest_offer_id": 331132,
                "status": 200,
                "version": "rss20"
            }
        """
        data_to_save = self._collect_feed_metadata(feed_parsed)

        # dump data
        if save_type == "json":
            json_dest = Path(self.CRAWLER_LATEST_METADATA)
            with json_dest.open("w", encoding="utf-8") as json_file:
                json.dump(data_to_save, json_file, indent=2, sort_keys=True)
        else:
            logging.warning(
                "Unsupported save type '%s': feed metadata has not been stored.",
                save_type,
            )

        return data_to_save

    def _is_feed_usable(
        self,
        feed: "feedparser.FeedParserDict",
        feed_url: str,
        ignore_encoding_errors: bool,
    ) -> bool:
        """Inspect feedparser 'bozo' flag and tell if parsing can go on.

        See: https://pythonhosted.org/feedparser/bozo.html#bozo-detection

        :param feedparser.FeedParserDict feed: parsed feed
        :param str feed_url: URL the feed was requested from, used in log messages
        :param bool ignore_encoding_errors: option to go on despite encoding errors

        :return: True if the feed entries can be processed
        :rtype: bool
        """
        if not feed.bozo:
            logging.info("Feed is well-formed. Everything is fine, go on!")
            return True

        logging.warning("Parser raised a non blocking error. Investigating...")
        bozo_error = feed.get("bozo_exception")
        encoding_doc = "{}character-encoding.html".format(FEEDPARSER_DOC_BASE_URL)

        if isinstance(bozo_error, feedparser.CharacterEncodingOverride):
            logging.error(
                "Feed encoding is badly declared. It could be parsed but with errors."
                " Parser error: %s."
                " See: %s",
                bozo_error,
                encoding_doc,
            )
            return bool(ignore_encoding_errors)

        if isinstance(bozo_error, feedparser.CharacterEncodingUnknown):
            logging.error(
                "Feed encoding could not be identified. "
                "Parsing result is likely to be unpredictable..."
                " Parser error: %s."
                " See: %s",
                bozo_error,
                encoding_doc,
            )
            return bool(ignore_encoding_errors)

        # any other error is considered as blocking
        logging.error(
            "Feed error is not recognized: %s. Aborting parsing of '%s'. See: %s",
            bozo_error,
            feed_url,
            "{}bozo.html".format(FEEDPARSER_DOC_BASE_URL),
        )
        return False

    def parse_new_offers(
        self, ignore_encoding_errors: bool = True, only_new_offers: bool = True
    ) -> list:
        """Parse RSS feed, handle errors and filter on new offers.

        Feed metadata are stored (see ``save_parsing_metadata``) only when the feed \
            contains entries, so an empty or unmodified feed does not reset the \
            latest offer ID kept from the previous execution.

        :param bool ignore_encoding_errors: option to ignore encoding exceptions. Defaults to: True
        :param bool only_new_offers: option to return only new offers basing on the \
            previous crawler execution. If False, all of the feed items will be returned.
            Defaults to: True

        :return: list of feed entries whose identifier is superior to the latest \
            parsed (or every entry with a valid identifier if only_new_offers is False)
        :rtype: list
        """
        # retrieve informations from previous crawler run
        previous_metadata = (
            self.load_previous_crawler_metadata(self.CRAWLER_LATEST_METADATA) or {}
        )
        # a missing or null ID means that nothing has been parsed yet
        last_id = previous_metadata.get("latest_offer_id") or 0

        # list to store new offers (feed entries)
        li_new_job_offers = []

        # RSS parser
        logging.info(
            "Connecting to the RSS. Expecting %s entries as specified in settings.",
            self.items_to_parse,
        )
        feed_url = self._build_feed_url()
        feed = feedparser.parse(
            url_file_stream_or_string=feed_url,
            # sent as 'If-Modified-Since' header
            modified=previous_metadata.get("feed_updated_parsed"),
        )

        # test if feed is well-formed
        if not self._is_feed_usable(feed, feed_url, ignore_encoding_errors):
            # then return empty list
            return li_new_job_offers

        # server answered that nothing changed since the previous execution
        if feed.get("status") == 304:
            logging.info(
                "Feed has not been modified since the previous execution. "
                "Nothing to parse."
            )
            return li_new_job_offers

        # test if feed contains entries
        if not feed.entries:
            # log everything but do not overwrite stored metadata
            logging.error(
                "RSS feed is empty, no entries (items) found. Feed info: %s.",
                self._collect_feed_metadata(feed),
            )
            # then return empty list
            return li_new_job_offers

        # save feed metadata
        self.save_parsing_metadata(feed)

        if self.items_to_parse and (len(feed.entries) != self.items_to_parse):
            logging.warning(
                "Number of items (%s) is different from the required: %s.",
                len(feed.entries),
                self.items_to_parse,
            )
        else:
            logging.info("%s items retrieved from the feed.", len(feed.entries))

        # looping on feed entries
        for entry_index, entry in enumerate(feed.entries):
            # get the ID cleaning 'link' markup
            try:
                job_id = self.extract_offer_id_from_url(entry.id)
            except (AttributeError, TypeError, ValueError) as err:
                logging.error("Feed index corrupted: %s - (%s)", entry_index, err)
                continue

            # if entry's ID is greater than ID stored into the file,
            # that means the offer is more recent and has to be processed.
            # This default behavior can be ignored with 'only_new_offers=False'
            if job_id > last_id:
                li_new_job_offers.append(entry)
                logging.debug("New offer spotted: %s", job_id)
            elif only_new_offers is False:
                li_new_job_offers.append(entry)
                logging.debug("Offer is not newer but still added: %s", job_id)
            else:
                logging.debug(
                    "Offer older than the latest previous parsed: %s", job_id
                )

        logging.info("%s new offers to add.", len(li_new_job_offers))
        return li_new_job_offers


# #############################################################################
# ##### Stand alone program ########
# ##################################
if __name__ == "__main__":
    # Standalone execution for quick and dirty use or test

    # logging with debug
    logging.basicConfig(level=logging.DEBUG)

    # use module: request a single item and get it back even if it is not new
    rss_parser = GeorezoRssParser(items_to_parse=1)
    fetched_offers = rss_parser.parse_new_offers(only_new_offers=False)
    print(isinstance(fetched_offers, list))

    # display available keys and raw summary of each returned entry
    for offer in fetched_offers:
        print(offer.keys())
        print(offer.summary.encode("latin1"))