package ai.platon.scent.tools;

import ai.platon.pulsar.common.AppPaths;
import ai.platon.pulsar.common.NetUtil;
import ai.platon.pulsar.common.ProcessLauncher;
import ai.platon.pulsar.common.Runtimes;
import ai.platon.pulsar.common.browser.Browsers;
import ai.platon.pulsar.common.urls.URLUtils;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.skeleton.common.options.LoadOptions;
import ai.platon.pulsar.skeleton.common.urls.NormURL;
import ai.platon.pulsar.skeleton.context.PulsarContext;
import ai.platon.pulsar.skeleton.crawl.filter.ChainedUrlNormalizer;
import ai.platon.pulsar.skeleton.crawl.filter.ScopedUrlNormalizer;
import ai.platon.pulsar.skeleton.session.PulsarSession;
import ai.platon.scent.BasicScentSession;
import ai.platon.scent.dom.HNormUrl;
import ai.platon.scent.dom.HarvestOptions;
import ai.platon.scent.ql.h2.context.ScentSQLContext;
import ai.platon.scent.ql.h2.context.ScentSQLContexts;
import ai.platon.scent.skeleton.ScentSession;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import kotlin.Metadata;
import kotlin.collections.CollectionsKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.sequences.Sequence;
import kotlin.sequences.SequencesKt;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* compiled from: SimpleCrawler.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��p\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0010\u000b\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0004\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0010\u0002\n\u0002\b\u0003\n\u0002\u0010\u000e\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0010\u001e\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0010 \n��\b\u0016\u0018��2\u00020\u0001B\u001b\u0012\b\b\u0002\u0010\u0002\u001a\u00020\u0003\u0012\n\b\u0002\u0010\u0004\u001a\u0004\u0018\u00010\u0005¢\u0006\u0002\u0010\u0006J\u000e\u0010\u001b\u001a\u00020\u001c2\u0006\u0010\u0004\u001a\u00020\u0005J\b\u0010\u001d\u001a\u00020\u001cH\u0016J\u0016\u0010\u001e\u001a\u00020\u001c2\u0006\u0010\u001f\u001a\u00020 2\u0006\u0010!\u001a\u00020\"J\u0016\u0010\u001e\u001a\u00020\u001c2\u0006\u0010\u001f\u001a\u00020 2\u0006\u0010#\u001a\u00020 J\u001c\u0010$\u001a\b\u0012\u0004\u0012\u00020&0%2\u0006\u0010'\u001a\u00020 2\u0006\u0010!\u001a\u00020\"J\u001c\u0010$\u001a\b\u0012\u0004\u0012\u00020&0%2\u0006\u0010'\u001a\u00020 2\u0006\u0010#\u001a\u00020 J\u000e\u0010(\u001a\u00020\u001c2\u0006\u0010)\u001a\u00020*J\u000e\u0010(\u001a\u00020\u001c2\u0006\u0010\u001f\u001a\u00020 J\u000e\u0010+\u001a\u00020\u001c2\u0006\u0010)\u001a\u00020*J\u001c\u0010,\u001a\b\u0012\u0004\u0012\u00020 0-2\u0006\u0010'\u001a\u00020 2\u0006\u0010#\u001a\u00020 R\u000e\u0010\u0007\u001a\u00020\bX\u0082\u0004¢\u0006\u0002\n��R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\t\u0010\nR\u0014\u0010\u000b\u001a\u00020\f8VX\u0096\u0004¢\u0006\u0006\u001a\u0004\b\u000b\u0010\rR\u0016\u0010\u000e\u001a\n 
\u0010*\u0004\u0018\u00010\u000f0\u000fX\u0082\u0004¢\u0006\u0002\n��R\u0013\u0010\u0004\u001a\u0004\u0018\u00010\u0005¢\u0006\b\n��\u001a\u0004\b\u0011\u0010\u0012R\u0011\u0010\u0013\u001a\u00020\u0014¢\u0006\b\n��\u001a\u0004\b\u0015\u0010\u0016R\u0014\u0010\u0017\u001a\u00020\u00188VX\u0096\u0004¢\u0006\u0006\u001a\u0004\b\u0019\u0010\u001a¨\u0006."}, d2 = {"Lai/platon/scent/tools/SimpleCrawler;", "Ljava/lang/AutoCloseable;", "context", "Lai/platon/pulsar/skeleton/context/PulsarContext;", "normalizer", "Lai/platon/pulsar/skeleton/crawl/filter/ScopedUrlNormalizer;", "(Lai/platon/pulsar/skeleton/context/PulsarContext;Lai/platon/pulsar/skeleton/crawl/filter/ScopedUrlNormalizer;)V", "closed", "Ljava/util/concurrent/atomic/AtomicBoolean;", "getContext", "()Lai/platon/pulsar/skeleton/context/PulsarContext;", "isActive", "", "()Z", "logger", "Lorg/slf4j/Logger;", "kotlin.jvm.PlatformType", "getNormalizer", "()Lai/platon/pulsar/skeleton/crawl/filter/ScopedUrlNormalizer;", "session", "Lai/platon/scent/BasicScentSession;", "getSession", "()Lai/platon/scent/BasicScentSession;", "sqlContext", "Lai/platon/scent/ql/h2/context/ScentSQLContext;", "getSqlContext", "()Lai/platon/scent/ql/h2/context/ScentSQLContext;", "addUrlNormalizer", "", "close", "load", "url", "", "options", "Lai/platon/pulsar/skeleton/common/options/LoadOptions;", "args", "loadOutPages", "", "Lai/platon/pulsar/persist/WebPage;", "portalUrl", "openBrowser", "path", "Ljava/nio/file/Path;", "openExplorer", "parseOutLinks", "", "scent-boot"})
@SourceDebugExtension({"SMAP\nSimpleCrawler.kt\nKotlin\n*S Kotlin\n*F\n+ 1 SimpleCrawler.kt\nai/platon/scent/tools/SimpleCrawler\n+ 2 _Sequences.kt\nkotlin/sequences/SequencesKt___SequencesKt\n+ 3 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 4 fake.kt\nkotlin/jvm/internal/FakeKt\n*L\n1#1,125:1\n1155#2,3:126\n1155#2,3:129\n766#3:132\n857#3,2:133\n1620#3,3:136\n1549#3:139\n1620#3,3:140\n1549#3:143\n1620#3,3:144\n1#4:135\n*S KotlinDebug\n*F\n+ 1 SimpleCrawler.kt\nai/platon/scent/tools/SimpleCrawler\n*L\n52#1:126,3\n55#1:129,3\n56#1:132\n56#1:133,2\n82#1:136,3\n83#1:139\n83#1:140,3\n97#1:143\n97#1:144,3\n*E\n"})
/* loaded from: input_file:ai/platon/scent/tools/SimpleCrawler.class */
public class SimpleCrawler implements AutoCloseable {

    @NotNull
    private final PulsarContext context;

    @Nullable
    private final ScopedUrlNormalizer normalizer;
    private final Logger logger;

    @NotNull
    private final AtomicBoolean closed;

    @NotNull
    private final BasicScentSession session;

    public SimpleCrawler(@NotNull PulsarContext pulsarContext, @Nullable ScopedUrlNormalizer scopedUrlNormalizer) {
        Intrinsics.checkNotNullParameter(pulsarContext, "context");
        this.context = pulsarContext;
        this.normalizer = scopedUrlNormalizer;
        this.logger = LoggerFactory.getLogger(SimpleCrawler.class);
        this.closed = new AtomicBoolean();
        this.session = getSqlContext().createSession();
        if (this.normalizer != null) {
            ChainedUrlNormalizer.add$default(this.session.getContext().getUrlNormalizer(), this.normalizer, (String) null, 2, (Object) null);
        }
    }

    public /* synthetic */ SimpleCrawler(PulsarContext pulsarContext, ScopedUrlNormalizer scopedUrlNormalizer, int i, DefaultConstructorMarker defaultConstructorMarker) {
        this((i & 1) != 0 ? (PulsarContext) ScentSQLContexts.INSTANCE.create() : pulsarContext, (i & 2) != 0 ? null : scopedUrlNormalizer);
    }

    @NotNull
    public final PulsarContext getContext() {
        return this.context;
    }

    @Nullable
    public final ScopedUrlNormalizer getNormalizer() {
        return this.normalizer;
    }

    public boolean isActive() {
        return !this.closed.get() && this.session.isActive();
    }

    @NotNull
    public ScentSQLContext getSqlContext() {
        ScentSQLContext scentSQLContext = this.context;
        Intrinsics.checkNotNull(scentSQLContext, "null cannot be cast to non-null type ai.platon.scent.ql.h2.context.ScentSQLContext");
        return scentSQLContext;
    }

    @NotNull
    public final BasicScentSession getSession() {
        return this.session;
    }

    public final void addUrlNormalizer(@NotNull ScopedUrlNormalizer scopedUrlNormalizer) {
        Intrinsics.checkNotNullParameter(scopedUrlNormalizer, "normalizer");
        ChainedUrlNormalizer.add$default(this.session.getContext().getUrlNormalizer(), scopedUrlNormalizer, (String) null, 2, (Object) null);
    }

    public final void load(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "url");
        Intrinsics.checkNotNullParameter(str2, "args");
        load(str, (LoadOptions) this.session.options(str2));
    }

    public final void load(@NotNull String str, @NotNull LoadOptions loadOptions) {
        Intrinsics.checkNotNullParameter(str, "url");
        Intrinsics.checkNotNullParameter(loadOptions, "options");
        FeaturedDocument parse = this.session.parse(this.session.load(str));
        parse.absoluteLinks();
        parse.stripScripts();
        Sequence<String> filter = SequencesKt.filter(CollectionsKt.asSequence(FeaturedDocument.select$default(parse, loadOptions.getOutLinkSelector(), 0, 0, new Function1<Element, String>() { // from class: ai.platon.scent.tools.SimpleCrawler$load$1
            @NotNull
            public final String invoke(@NotNull Element element) {
                Intrinsics.checkNotNullParameter(element, "it");
                return element.attr("abs:href");
            }
        }, 6, (Object) null)), new Function1<String, Boolean>() { // from class: ai.platon.scent.tools.SimpleCrawler$load$2
            @NotNull
            public final Boolean invoke(@NotNull String str2) {
                Intrinsics.checkNotNullParameter(str2, "it");
                return Boolean.valueOf(URLUtils.isStandard(str2));
            }
        });
        HashSet hashSet = new HashSet();
        for (String str2 : filter) {
            Intrinsics.checkNotNull(str2);
            hashSet.add(StringsKt.substringBefore$default(str2, ".com", (String) null, 2, (Object) null));
        }
        Sequence filter2 = SequencesKt.filter(CollectionsKt.asSequence(hashSet), new Function1<String, Boolean>() { // from class: ai.platon.scent.tools.SimpleCrawler$load$4
            @NotNull
            public final Boolean invoke(@NotNull String str3) {
                Intrinsics.checkNotNullParameter(str3, "it");
                return Boolean.valueOf(!StringsKt.isBlank(str3));
            }
        });
        HashSet hashSet2 = new HashSet();
        Iterator it = filter2.iterator();
        while (it.hasNext()) {
            hashSet2.add(((String) it.next()) + ".com");
        }
        HashSet hashSet3 = hashSet2;
        ArrayList arrayList = new ArrayList();
        for (Object obj : hashSet3) {
            if (NetUtil.testHttpNetwork(new URL((String) obj))) {
                arrayList.add(obj);
            }
        }
        System.out.println((Object) CollectionsKt.joinToString$default(CollectionsKt.take(arrayList, 10), "\n", (CharSequence) null, (CharSequence) null, 0, (CharSequence) null, new Function1<String, CharSequence>() { // from class: ai.platon.scent.tools.SimpleCrawler$load$7
            @NotNull
            public final CharSequence invoke(@NotNull String str3) {
                Intrinsics.checkNotNullParameter(str3, "it");
                return str3;
            }
        }, 30, (Object) null));
        this.logger.info("Export to: file://{}", this.session.export(parse));
    }

    @NotNull
    public final Collection<WebPage> loadOutPages(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(str2, "args");
        return loadOutPages(str, (LoadOptions) this.session.options(str2));
    }

    @NotNull
    public final Collection<WebPage> loadOutPages(@NotNull String str, @NotNull LoadOptions loadOptions) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(loadOptions, "options");
        WebPage load = this.session.load(str, loadOptions);
        if (!load.getProtocolStatus().isSuccess()) {
            this.logger.warn("Failed to load page | {}", str);
        }
        FeaturedDocument parse = this.session.parse(load);
        parse.absoluteLinks();
        parse.stripScripts();
        this.logger.info("Portal page is exported to: file://" + this.session.export(parse));
        List<String> select$default = FeaturedDocument.select$default(parse, loadOptions.getOutLinkSelector(), 0, 0, new Function1<Element, String>() { // from class: ai.platon.scent.tools.SimpleCrawler$loadOutPages$links$1
            @NotNull
            public final String invoke(@NotNull Element element) {
                Intrinsics.checkNotNullParameter(element, "it");
                return element.attr("abs:href");
            }
        }, 6, (Object) null);
        LinkedHashSet linkedHashSet = new LinkedHashSet();
        for (String str2 : select$default) {
            PulsarSession pulsarSession = this.session;
            Intrinsics.checkNotNull(str2);
            linkedHashSet.add(PulsarSession.DefaultImpls.normalize$default(pulsarSession, str2, loadOptions, false, 4, (Object) null));
        }
        List take = CollectionsKt.take(linkedHashSet, loadOptions.getTopLinks());
        ArrayList arrayList = new ArrayList(CollectionsKt.collectionSizeOrDefault(take, 10));
        Iterator it = take.iterator();
        while (it.hasNext()) {
            arrayList.add(((NormURL) it.next()).getSpec());
        }
        ArrayList arrayList2 = arrayList;
        this.logger.info("Total {} items to load", Integer.valueOf(arrayList2.size()));
        LoadOptions createItemOptions = loadOptions.createItemOptions();
        createItemOptions.setParse(true);
        return this.session.loadAll(arrayList2, createItemOptions);
    }

    @NotNull
    public final List<String> parseOutLinks(@NotNull String str, @NotNull String str2) {
        Intrinsics.checkNotNullParameter(str, "portalUrl");
        Intrinsics.checkNotNullParameter(str2, "args");
        HNormUrl normalize$default = ScentSession.DefaultImpls.normalize$default(this.session, str, this.session.options(str2), false, 4, (Object) null);
        HarvestOptions hOptions = normalize$default.getHOptions();
        List take = CollectionsKt.take(FeaturedDocument.select$default(this.session.parse(this.session.load(normalize$default)), hOptions.getOutLinkSelector(), 0, 0, 6, (Object) null), hOptions.getTopLinks());
        ArrayList arrayList = new ArrayList(CollectionsKt.collectionSizeOrDefault(take, 10));
        Iterator it = take.iterator();
        while (it.hasNext()) {
            arrayList.add(((Element) it.next()).attr("abs:href"));
        }
        return arrayList;
    }

    public final void openExplorer(@NotNull Path path) {
        Intrinsics.checkNotNullParameter(path, "path");
        Runtimes.INSTANCE.exec("explorer.exe " + path);
    }

    public final void openBrowser(@NotNull Path path) {
        Intrinsics.checkNotNullParameter(path, "path");
        openBrowser(path.toString());
    }

    public final void openBrowser(@NotNull String str) {
        Intrinsics.checkNotNullParameter(str, "url");
        ProcessLauncher.INSTANCE.launch(String.valueOf(Browsers.INSTANCE.searchChromeBinary()), CollectionsKt.listOf(new String[]{str, "--user-data-dir=" + AppPaths.INSTANCE.getTmp("exotic-chrome", new String[0]), "--no-first-run", "--no-default-browser-check"}));
    }

    @Override // java.lang.AutoCloseable
    public void close() {
        if (this.closed.compareAndSet(false, true)) {
        }
    }

    public SimpleCrawler() {
        this(null, null, 3, null);
    }
}
