/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.util;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.hostdb.HostDatum;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SitemapProcessor
extends Configured
implements Tool {
    /** Logger shared by the tool and by the nested mapper and reducer classes. */
    private static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class);
    /**
     * Date format with pattern {@code yyyy-MM-dd HH:mm:ss}. Not referenced in the visible part of
     * this file. NOTE(review): {@link SimpleDateFormat} is documented by the JDK as not
     * synchronized, so external users of this shared public instance must synchronize themselves.
     */
    public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    /** Name of the sub-directory, below the crawldb and the hostdb, that is used as job input. */
    public static final String CURRENT_NAME = "current";
    /** Name of the lock file created below the crawldb while the job runs. */
    public static final String LOCK_NAME = ".locked";
    /** Boolean property set by {@code sitemap(...)} from its {@code strict} argument; the mapper reads it (default {@code true}). */
    public static final String SITEMAP_STRICT_PARSING = "sitemap.strict.parsing";
    /** Boolean property set by {@code sitemap(...)} from its {@code filter} argument; the mapper only creates URL filters when it is {@code true}. */
    public static final String SITEMAP_URL_FILTERING = "sitemap.url.filter";
    /** Boolean property set by {@code sitemap(...)} from its {@code normalize} argument; the mapper only creates URL normalizers when it is {@code true}. */
    public static final String SITEMAP_URL_NORMALIZING = "sitemap.url.normalize";
    /** Boolean property (mapper default {@code true}): when a host's robots rules list no sitemaps, try {@code sitemap.xml} below the host root. */
    public static final String SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT = "sitemap.url.default.sitemap.xml";
    /** Boolean property (reducer default {@code false}): copy score, fetch interval and modified time from a sitemap entry onto an already existing entry. */
    public static final String SITEMAP_OVERWRITE_EXISTING = "sitemap.url.overwrite.existing";
    /** Integer property (mapper default 3): maximum number of redirects followed when fetching one sitemap. */
    public static final String SITEMAP_REDIR_MAX = "sitemap.redir.max";
    /** Integer property (mapper default 0x3200000): value the mapper copies into {@code http.content.limit} and {@code file.content.limit}. */
    public static final String SITEMAP_SIZE_MAX = "sitemap.size.max";

    /**
     * Runs the sitemap job and installs its output as the new {@code current} directory of the
     * crawldb.
     *
     * <p>The job reads the existing {@code current} directory plus, when given, the sitemap URL
     * directory and the {@code current} directory of the hostdb. Its output is written to a
     * randomly named temporary directory below the crawldb and swapped in on success. A lock file
     * is held below the crawldb for the whole run and is released on success as well as on every
     * failure path handled here, including a failure while the job is still being configured.
     *
     * @param crawldb       crawldb directory to update
     * @param hostdb        hostdb directory, or {@code null} to skip it
     * @param sitemapUrlDir directory with sitemap URLs or host names, or {@code null} to skip it
     * @param strict        stored in {@link #SITEMAP_STRICT_PARSING} for the mapper
     * @param filter        stored in {@link #SITEMAP_URL_FILTERING} for the mapper
     * @param normalize     stored in {@link #SITEMAP_URL_NORMALIZING} for the mapper
     * @param threads       number of threads passed to {@code MultithreadedMapper}
     * @throws RuntimeException when the job completes unsuccessfully or cannot be configured
     * @throws Exception        any I/O, class-loading or interruption failure raised by the job
     */
    public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter, boolean normalize, int threads) throws Exception {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("SitemapProcessor: starting");
        Configuration conf = this.getConf();
        FileSystem fs = crawldb.getFileSystem(conf);
        Path old = new Path(crawldb, "old");
        Path current = new Path(crawldb, CURRENT_NAME);
        Path tempCrawlDb = new Path(crawldb, "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Path lock = new Path(crawldb, LOCK_NAME);
        if (!fs.exists(current)) {
            fs.mkdirs(current);
        }
        LockUtil.createLockFile(fs, lock, false);
        conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
        conf.setBoolean(SITEMAP_URL_FILTERING, filter);
        conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
        conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        Job job;
        try {
            job = createSitemapJob(conf, crawldb, current, hostdb, sitemapUrlDir, tempCrawlDb, threads);
        }
        catch (IOException | RuntimeException e) {
            // The lock is already held but no job output exists yet: release the lock so that
            // a failed job setup does not leave the crawldb locked.
            LOG.error("SitemapProcessor_{}", crawldb.toString(), e);
            LockUtil.removeLockFile(fs, lock);
            throw e;
        }

        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("SitemapProcessor", job);
                LOG.error(message);
                NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
                throw new RuntimeException(message);
            }
            boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
            if (!preserveBackup && fs.exists(old)) {
                fs.delete(old, true);
            } else {
                FSUtils.replace(fs, old, current, true);
            }
            FSUtils.replace(fs, current, tempCrawlDb, true);
            LockUtil.removeLockFile(fs, lock);
            logJobCounters(job);
            stopWatch.stop();
            LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
        }
        catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("SitemapProcessor_{}", crawldb.toString(), e);
            NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag only after the cleanup so the file system calls
                // above are not affected by it.
                Thread.currentThread().interrupt();
            }
            throw e;
        }
    }

    /** Creates the job and wires its inputs, output, mapper and reducer. */
    private static Job createSitemapJob(Configuration conf, Path crawldb, Path current, Path hostdb, Path sitemapUrlDir, Path tempCrawlDb, int threads) throws IOException {
        Job job = Job.getInstance(conf, "Nutch SitemapProcessor: " + crawldb.toString());
        job.setJarByClass(SitemapProcessor.class);
        MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
        if (sitemapUrlDir != null) {
            MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);
        }
        if (hostdb != null) {
            MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);
            if (conf.getStrings("http.robot.rules.allowlist") != null) {
                LOG.warn("Non-empty property \"http.robot.rules.allowlist\": sitemap discovery via robots.txt is not possible for the listed hosts!");
            }
        }
        FileOutputFormat.setOutputPath(job, tempCrawlDb);
        job.setOutputFormatClass(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.setMapperClass(MultithreadedMapper.class);
        MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
        MultithreadedMapper.setNumberOfThreads(job, threads);
        job.setReducerClass(SitemapReducer.class);
        return job;
    }

    /** Logs the values of the "Sitemap" counter group of a finished job. */
    private static void logJobCounters(Job job) throws IOException {
        long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
        long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
        long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
        long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
        long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();
        LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
        LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
        LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
        LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
        LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner} with a configuration
     * obtained from {@code NutchConfiguration.create()} and exits the JVM with the tool's
     * return code.
     *
     * @param args command-line arguments, passed on unchanged
     * @throws Exception whatever {@code ToolRunner.run} throws
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = NutchConfiguration.create();
        Tool tool = new SitemapProcessor();
        int exitCode = ToolRunner.run(configuration, tool, args);
        System.exit(exitCode);
    }

    /**
     * Prints the command-line help to standard error, one {@code println} per help line.
     *
     * <p>NOTE(review): the help text lists a {@code -force} option, but {@code run(String[])}
     * in this file has no branch for it and treats it as an invalid argument.
     */
    public static void usage() {
        String[] helpLines = {
            "Usage:\n SitemapProcessor <crawldb> [-hostdb <hostdb>] [-sitemapUrls <url_dir>] [-threads <threads>] [-force] [-noStrict] [-noFilter] [-noNormalize]\n",
            "\t<crawldb>\t\tpath to crawldb where the sitemap urls would be injected",
            "\t-hostdb <hostdb>\tpath of a hostdb. Sitemap(s) from these hosts would be downloaded",
            "\t-sitemapUrls <url_dir>\tpath to directory with sitemap urls or hostnames",
            "\t-threads <threads>\tNumber of threads created per mapper to fetch sitemap urls (default: 8)",
            "\t-force\t\t\tforce update even if CrawlDb appears to be locked (CAUTION advised)",
            "\t-noStrict\t\tBy default Sitemap parser rejects invalid urls. '-noStrict' disables that.",
            "\t-noFilter\t\tturn off URLFilters on urls (optional)",
            "\t-noNormalize\t\tturn off URLNormalizer on urls (optional)",
        };
        for (String helpLine : helpLines) {
            System.err.println(helpLine);
        }
    }

    /**
     * Parses the command line and runs {@code sitemap(...)}.
     *
     * <p>The first argument is the crawldb path. It is followed by any of {@code -hostdb <path>},
     * {@code -sitemapUrls <path>}, {@code -threads <n>}, {@code -noStrict}, {@code -noFilter}
     * and {@code -noNormalize}. An unknown option, an option whose value is missing, or a
     * thread count that is not a positive integer prints the usage and fails instead of raising
     * an exception.
     *
     * @param args command-line arguments
     * @return 0 on success, -1 on invalid arguments or when the job fails
     * @throws Exception declared by the tool interface; failures of the job itself are logged
     *                   and reported through the return value
     */
    public int run(String[] args) throws Exception {
        if (args.length < 3) {
            SitemapProcessor.usage();
            return -1;
        }
        Path crawlDb = new Path(args[0]);
        Path hostDb = null;
        Path urlDir = null;
        boolean strict = true;
        boolean filter = true;
        boolean normalize = true;
        int threads = 8;
        for (int i = 1; i < args.length; ++i) {
            String option = args[i];
            boolean takesValue = option.equals("-hostdb") || option.equals("-sitemapUrls") || option.equals("-threads");
            if (takesValue) {
                if (i + 1 >= args.length) {
                    // The option is the last argument: there is no value left to consume.
                    LOG.error("SitemapProcessor: Missing value for argument \"{}\"", option);
                    SitemapProcessor.usage();
                    return -1;
                }
                String optionValue = args[++i];
                if (option.equals("-hostdb")) {
                    hostDb = new Path(optionValue);
                    LOG.info("SitemapProcessor: hostdb: {}", hostDb);
                } else if (option.equals("-sitemapUrls")) {
                    urlDir = new Path(optionValue);
                    LOG.info("SitemapProcessor: sitemap urls dir: {}", urlDir);
                } else {
                    try {
                        threads = Integer.parseInt(optionValue);
                    }
                    catch (NumberFormatException e) {
                        LOG.error("SitemapProcessor: Invalid number of threads \"{}\"", optionValue);
                        SitemapProcessor.usage();
                        return -1;
                    }
                    if (threads < 1) {
                        // At least one mapper thread is needed to process any input.
                        LOG.error("SitemapProcessor: Invalid number of threads \"{}\"", optionValue);
                        SitemapProcessor.usage();
                        return -1;
                    }
                    LOG.info("SitemapProcessor: threads: {}", threads);
                }
            } else if (option.equals("-noStrict")) {
                LOG.info("SitemapProcessor: 'strict' parsing disabled");
                strict = false;
            } else if (option.equals("-noFilter")) {
                LOG.info("SitemapProcessor: filtering disabled");
                filter = false;
            } else if (option.equals("-noNormalize")) {
                LOG.info("SitemapProcessor: normalizing disabled");
                normalize = false;
            } else {
                LOG.info("SitemapProcessor: Found invalid argument \"{}\"\n", option);
                SitemapProcessor.usage();
                return -1;
            }
        }
        try {
            this.sitemap(crawlDb, hostDb, urlDir, strict, filter, normalize, threads);
            return 0;
        }
        catch (Exception e) {
            LOG.error("SitemapProcessor: {}", StringUtils.stringifyException(e));
            return -1;
        }
    }

    /**
     * Merges, per URL, the entry that was already in the crawldb with the entry discovered in
     * a sitemap.
     *
     * <p>Entries with status 66 are the ones the mapper creates for sitemap URLs; every other
     * status is treated as an already existing entry. An existing entry always wins; it only
     * takes over score, fetch interval and modified time from the sitemap entry when
     * {@code sitemap.url.overwrite.existing} is enabled.
     */
    private static class SitemapReducer
    extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        CrawlDatum sitemapDatum = null;
        CrawlDatum originalDatum = null;
        private boolean overwriteExisting = false;

        private SitemapReducer() {
        }

        /** Reads the overwrite flag (default {@code false}) from the job configuration. */
        public void setup(Reducer.Context context) {
            this.overwriteExisting = context.getConfiguration().getBoolean(SitemapProcessor.SITEMAP_OVERWRITE_EXISTING, false);
        }

        /**
         * Emits at most one datum for {@code key}: the existing entry when there is one,
         * otherwise the sitemap entry with its status changed to 1.
         */
        public void reduce(Text key, Iterable<CrawlDatum> values, Reducer.Context context) throws IOException, InterruptedException {
            this.sitemapDatum = null;
            this.originalDatum = null;
            for (CrawlDatum value : values) {
                boolean fromSitemap = value.getStatus() == 66;
                // Each value is copied into a fresh datum before it is kept.
                CrawlDatum copy = new CrawlDatum();
                copy.set(value);
                if (fromSitemap) {
                    this.sitemapDatum = copy;
                } else {
                    this.originalDatum = copy;
                }
            }

            if (this.originalDatum == null) {
                if (this.sitemapDatum == null) {
                    return;
                }
                // URL only known from a sitemap: emit it as a new entry.
                // NOTE(review): status 1 is presumably the "db unfetched" status - confirm.
                context.getCounter("Sitemap", "new_sitemap_entries").increment(1L);
                this.sitemapDatum.setStatus(1);
                context.write(key, this.sitemapDatum);
                return;
            }

            boolean takeOverSitemapValues = this.overwriteExisting && this.sitemapDatum != null;
            if (takeOverSitemapValues) {
                this.originalDatum.setScore(this.sitemapDatum.getScore());
                this.originalDatum.setFetchInterval(this.sitemapDatum.getFetchInterval());
                this.originalDatum.setModifiedTime(this.sitemapDatum.getModifiedTime());
            }
            context.getCounter("Sitemap", "existing_sitemap_entries").increment(1L);
            context.write(key, this.originalDatum);
        }
    }

    private static class SitemapMapper
    extends Mapper<Text, Writable, Text, CrawlDatum> {
        // Resolves the protocol implementation for a URL; created in setup().
        private ProtocolFactory protocolFactory = null;
        // When true, the parser is created in strict mode and sitemap URLs reporting
        // !isValid() are skipped.
        private boolean strict = true;
        // When true, setup() creates the URL filters used by filterNormalize().
        private boolean filter = true;
        // When true, setup() creates the URL normalizers used by filterNormalize().
        private boolean normalize = true;
        // When true, "sitemap.xml" below the host root is tried if robots rules list no sitemaps.
        private boolean tryDefaultSitemapXml = true;
        // Maximum number of redirects followed while fetching one sitemap.
        private int maxRedir = 3;
        // Lower and upper bound applied to fetch intervals derived from a sitemap's change
        // frequency. NOTE(review): presumably seconds (3.1536E7 = 365 * 86400) - confirm.
        private float minFetchInterval = 60.0f;
        private float maxFetchInterval = 3.1536E7f;
        // Null when filtering / normalizing is disabled.
        private URLFilters filters = null;
        private URLNormalizers normalizers = null;
        // Single datum instance passed to every robots-rules and protocol request of this mapper.
        private CrawlDatum datum = new CrawlDatum();
        // Sitemap parser; created in setup() with the strict flag.
        private SiteMapParser parser = null;

        // Private no-arg constructor with an empty body.
        private SitemapMapper() {
        }

        /**
         * Initializes the mapper from the job configuration: applies the sitemap size limit to
         * the http and file content limits, creates the protocol factory and the sitemap
         * parser, and creates URL filters / normalizers only when they are enabled.
         */
        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();

            // Default limit: 50 * 1024 * 1024 == 0x3200000.
            int contentLimit = conf.getInt(SITEMAP_SIZE_MAX, 50 * 1024 * 1024);
            conf.setInt("http.content.limit", contentLimit);
            conf.setInt("file.content.limit", contentLimit);
            protocolFactory = new ProtocolFactory(conf);

            filter = conf.getBoolean(SITEMAP_URL_FILTERING, true);
            normalize = conf.getBoolean(SITEMAP_URL_NORMALIZING, true);
            strict = conf.getBoolean(SITEMAP_STRICT_PARSING, true);
            tryDefaultSitemapXml = conf.getBoolean(SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT, true);
            maxRedir = conf.getInt(SITEMAP_REDIR_MAX, 3);
            parser = new SiteMapParser(strict);

            minFetchInterval = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
            maxFetchInterval = conf.getFloat("db.fetch.schedule.adaptive.max_interval", 3.1536E7f);

            if (filter) {
                filters = new URLFilters(conf);
            }
            if (normalize) {
                normalizers = new URLNormalizers(conf, "default");
            }
        }

        /**
         * Dispatches one input record by the type of its value.
         *
         * <ul>
         * <li>{@code CrawlDatum}: an existing crawldb entry, passed through unchanged. A failure
         * to write it is propagated so that the job cannot succeed with silently dropped
         * crawldb records.</li>
         * <li>{@code HostDatum}: the key is a host name; its sitemaps are looked up.</li>
         * <li>{@code Text}: the key is either a sitemap URL (http, https, ftp or file) that is
         * fetched directly, or a host name handled like a {@code HostDatum} record.</li>
         * </ul>
         *
         * <p>Sitemap discovery and fetching stay best effort: any exception other than an
         * interruption is logged and the record is skipped.
         *
         * @throws IOException          when writing a passed-through crawldb entry fails
         * @throws InterruptedException when the thread is interrupted while processing
         */
        public void map(Text key, Writable value, Mapper.Context context) throws IOException, InterruptedException {
            if (value instanceof CrawlDatum) {
                // Deliberately outside the catch-all below: losing an existing entry must fail the task.
                context.write(key, (CrawlDatum) value);
                return;
            }
            try {
                if (value instanceof HostDatum) {
                    this.generateSitemapsFromHostname(key.toString(), context);
                } else if (value instanceof Text) {
                    String url = key.toString();
                    if (url.startsWith("http://") || url.startsWith("https://") || url.startsWith("ftp://") || url.startsWith("file:/")) {
                        url = this.filterNormalize(url);
                        if (url == null) {
                            context.getCounter("Sitemap", "filtered_records").increment(1L);
                            return;
                        }
                        context.getCounter("Sitemap", "sitemap_seeds").increment(1L);
                        this.generateSitemapUrlDatum(this.protocolFactory.getProtocol(url), url, context);
                    } else {
                        LOG.info("generateSitemapsFromHostname: {}", key.toString());
                        this.generateSitemapsFromHostname(key.toString(), context);
                    }
                }
            }
            catch (InterruptedException e) {
                // An interruption is a request to stop, not a per-record failure.
                throw e;
            }
            catch (Exception e) {
                LOG.warn("Exception for record {} : {}", key.toString(), StringUtils.stringifyException(e));
            }
        }

        /**
         * Applies the configured normalizers and then the configured filters to a URL. Either
         * step is skipped when it was disabled in setup().
         *
         * @param url URL to process
         * @return the resulting URL as returned by the last applied step, or {@code null} when
         *         normalizing or filtering threw an exception
         */
        private String filterNormalize(String url) {
            String candidate = url;
            try {
                if (this.normalizers != null) {
                    candidate = this.normalizers.normalize(candidate, "default");
                }
                if (this.filters != null) {
                    candidate = this.filters.filter(candidate);
                }
                return candidate;
            }
            catch (Exception e) {
                // The URL is still rejected as before, but the reason is no longer lost.
                LOG.debug("Rejecting URL {} after normalizer/filter failure: {}", url, e.toString());
                return null;
            }
        }

        /**
         * Discovers and processes the sitemaps of a host.
         *
         * <p>The host root URL is built by trying the prefixes {@code http://},
         * {@code https://}, {@code ftp://} and {@code file:/} in that order and keeping the
         * first one that survives filtering/normalizing. The sitemaps listed in the robots
         * rules of that URL are then fetched; when none are listed and the default sitemap is
         * enabled, {@code sitemap.xml} below the root URL is tried instead.
         *
         * <p>Failures are logged and never propagated. An interruption additionally restores
         * the thread's interrupt flag.
         *
         * @param host    host name taken from the input record
         * @param context mapper context used for counters and output
         */
        private void generateSitemapsFromHostname(String host, Mapper.Context context) {
            try {
                String url = null;
                for (String prefix : new String[] {"http://", "https://", "ftp://", "file:/"}) {
                    url = this.filterNormalize(prefix + host + "/");
                    if (url != null) {
                        break;
                    }
                }
                if (url == null) {
                    context.getCounter("Sitemap", "filtered_records").increment(1L);
                    return;
                }
                BaseRobotRules rules = this.protocolFactory.getProtocol(url).getRobotRules(new Text(url), this.datum, null);
                List<String> sitemaps = rules.getSitemaps();
                if (this.tryDefaultSitemapXml && sitemaps.isEmpty()) {
                    // Use a separate list instead of appending to the one owned by the robots
                    // rules, which may be shared or unmodifiable.
                    sitemaps = java.util.Collections.singletonList(url + "sitemap.xml");
                }
                for (String candidate : sitemaps) {
                    context.getCounter("Sitemap", "sitemaps_from_hostname").increment(1L);
                    String sitemap = this.filterNormalize(candidate);
                    if (sitemap == null) {
                        context.getCounter("Sitemap", "filtered_sitemaps_from_hostname").increment(1L);
                        continue;
                    }
                    this.generateSitemapUrlDatum(this.protocolFactory.getProtocol(sitemap), sitemap, context);
                }
            }
            catch (InterruptedException e) {
                // This method cannot rethrow, so keep the interruption visible to the caller.
                Thread.currentThread().interrupt();
                LOG.warn("Exception for record {} : {}", host, StringUtils.stringifyException(e));
            }
            catch (Exception e) {
                LOG.warn("Exception for record {} : {}", host, StringUtils.stringifyException(e));
            }
        }

        /**
         * Fetches a sitemap, follows redirects, parses it and emits one datum per accepted URL.
         * A sitemap index is expanded recursively; every sitemap URL is processed at most once
         * per call of this method, so an index that references itself cannot recurse forever.
         *
         * @param protocol protocol implementation used for all fetches of this call
         * @param url      sitemap URL to fetch
         * @param context  mapper context used for counters and output
         * @throws Exception when fetching, parsing or writing fails for the given sitemap
         */
        private void generateSitemapUrlDatum(Protocol protocol, String url, Mapper.Context context) throws Exception {
            this.fetchAndProcessSitemap(protocol, url, context, new java.util.HashSet<String>());
        }

        /** Processes one sitemap; {@code visited} holds the URLs already handled in this call tree. */
        private void fetchAndProcessSitemap(Protocol protocol, String url, Mapper.Context context, java.util.Set<String> visited) throws Exception {
            if (!visited.add(url)) {
                LOG.warn("Sitemap {} was already processed for this record, skipping (possible sitemap index cycle)", url);
                return;
            }
            ProtocolOutput output = protocol.getProtocolOutput(new Text(url), this.datum);
            ProtocolStatus status = output.getStatus();
            Content content = output.getContent();

            int redirectsLeft = this.maxRedir;
            while (!status.isSuccess() && status.isRedirect() && redirectsLeft > 0) {
                String[] redirectArgs = status.getArgs();
                if (redirectArgs == null || redirectArgs.length == 0) {
                    // Redirect without a target: handled below as a failed fetch.
                    break;
                }
                String target = this.filterNormalize(redirectArgs[0]);
                if (target == null) {
                    // Target rejected by the filters; keep the last fetched URL for the error log.
                    break;
                }
                url = target;
                visited.add(url);
                output = protocol.getProtocolOutput(new Text(url), this.datum);
                status = output.getStatus();
                content = output.getContent();
                --redirectsLeft;
            }

            // NOTE(review): 1 is presumably the protocol status "success" code - confirm.
            if (status.getCode() != 1) {
                context.getCounter("Sitemap", "failed_fetches").increment(1L);
                LOG.error("Error while fetching the sitemap. Status code: {} for {}", status.getCode(), url);
                return;
            }

            AbstractSiteMap asm = this.parser.parseSiteMap(content.getContentType(), content.getContent(), new URL(url));
            if (asm instanceof SiteMap) {
                LOG.info("Parsing sitemap file: {}", asm.getUrl().toString());
                this.emitSitemapUrls((SiteMap) asm, context);
            } else if (asm instanceof SiteMapIndex) {
                SiteMapIndex index = (SiteMapIndex) asm;
                Collection<AbstractSiteMap> children = index.getSitemaps(true);
                if (children.isEmpty()) {
                    return;
                }
                LOG.info("Parsing sitemap index file: {}", index.getUrl().toString());
                for (AbstractSiteMap child : children) {
                    String childUrl = this.filterNormalize(child.getUrl().toString());
                    if (childUrl == null) {
                        continue;
                    }
                    try {
                        this.fetchAndProcessSitemap(protocol, childUrl, context, visited);
                    }
                    catch (InterruptedException e) {
                        throw e;
                    }
                    catch (Exception e) {
                        // One broken child sitemap must not prevent processing of its siblings.
                        LOG.warn("Exception for record {} : {}", childUrl, StringUtils.stringifyException(e));
                    }
                }
            }
        }

        /** Emits a status-66 datum for every URL of the sitemap that passes validation and filtering. */
        private void emitSitemapUrls(SiteMap siteMap, Mapper.Context context) throws IOException, InterruptedException {
            Collection<SiteMapURL> entries = siteMap.getSiteMapUrls();
            for (SiteMapURL entry : entries) {
                if (this.strict && !entry.isValid()) {
                    continue;
                }
                String key = this.filterNormalize(entry.getUrl().toString());
                if (key == null) {
                    continue;
                }
                CrawlDatum sitemapUrlDatum = new CrawlDatum();
                sitemapUrlDatum.setStatus(66);
                // A priority that is not positive falls back to a score of 0.5.
                float priority = (float) entry.getPriority();
                sitemapUrlDatum.setScore(priority > 0.0f ? priority : 0.5f);
                if (entry.getChangeFrequency() != null) {
                    sitemapUrlDatum.setFetchInterval(this.clampedFetchInterval(entry));
                }
                // A last-modified time in the future is ignored.
                if (entry.getLastModified() != null && entry.getLastModified().getTime() <= System.currentTimeMillis()) {
                    sitemapUrlDatum.setModifiedTime(entry.getLastModified().getTime());
                }
                context.write(new Text(key), sitemapUrlDatum);
            }
        }

        /**
         * Maps the change frequency of a sitemap URL to a fetch interval and clamps it to the
         * configured minimum and maximum. Must only be called when the change frequency is
         * not null.
         */
        private int clampedFetchInterval(SiteMapURL entry) {
            int fetchInterval = -1;
            switch (entry.getChangeFrequency()) {
                case ALWAYS:
                    fetchInterval = 1;
                    break;
                case HOURLY:
                    fetchInterval = 3600;
                    break;
                case DAILY:
                    fetchInterval = 86400;
                    break;
                case WEEKLY:
                    fetchInterval = 604800;
                    break;
                case MONTHLY:
                    fetchInterval = 2592000;
                    break;
                case YEARLY:
                    fetchInterval = 31536000;
                    break;
                case NEVER:
                    fetchInterval = Integer.MAX_VALUE;
                    break;
                default:
                    // Unknown frequency: keep -1, which is raised to the minimum below.
                    break;
            }
            if ((float) fetchInterval > this.maxFetchInterval) {
                fetchInterval = (int) this.maxFetchInterval;
            } else if ((float) fetchInterval < this.minFetchInterval) {
                fetchInterval = (int) this.minFetchInterval;
            }
            return fetchInterval;
        }
    }
}

