package ai.platon.scent.tools;

import ai.platon.pulsar.common.LogsKt;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.persist.WebDb;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.persist.gora.generated.GWebPage;
import ai.platon.pulsar.skeleton.crawl.common.URLUtil;
import ai.platon.pulsar.skeleton.crawl.filter.ChainedUrlNormalizer;
import ai.platon.scent.ScentContext;
import ai.platon.scent.ScentSession;
import ai.platon.scent.common.MLPaths;
import ai.platon.scent.common.ScentWebPageExtKt;
import ai.platon.scent.common.sites.amazon.AmazonAsinUrlNormalizer;
import ai.platon.scent.common.sites.amazon.AmazonUrls;
import ai.platon.scent.context.support.AbstractScentContext;
import ai.platon.scent.dom.HarvestOptions;
import ai.platon.scent.ml.EncodeOptions;
import ai.platon.scent.ml.data.SimpleDataFrame;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import kotlin.Metadata;
import kotlin.collections.CollectionsKt;
import kotlin.collections.SetsKt;
import kotlin.enums.EnumEntries;
import kotlin.enums.EnumEntriesKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.Reflection;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.sequences.Sequence;
import kotlin.sequences.SequencesKt;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.slf4j.Logger;

/* compiled from: ScanningHarvestTaskExecutor.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��R\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010\u000e\n��\n\u0002\u0010\b\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n\u0002\b\u0003\n\u0002\u0018\u0002\n\u0002\b\n\n\u0002\u0018\u0002\n\u0002\b\u0005\n\u0002\u0010\u0002\n\u0002\b\u0005\n\u0002\u0010\u000b\n��\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0002\b\u0003\u0018��2\u00020\u0001B-\u0012\b\b\u0002\u0010\u0002\u001a\u00020\u0003\u0012\b\b\u0002\u0010\u0004\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0006\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0007\u001a\u00020\b¢\u0006\u0002\u0010\tJ\u0006\u0010\u001f\u001a\u00020 J\u0006\u0010!\u001a\u00020 J8\u0010\"\u001a\u00020 2\u0006\u0010#\u001a\u00020\u00032\b\b\u0002\u0010$\u001a\u00020\u00032\b\b\u0002\u0010%\u001a\u00020&2\u0014\b\u0002\u0010'\u001a\u000e\u0012\u0004\u0012\u00020)\u0012\u0004\u0012\u00020&0(J\u0010\u0010*\u001a\u00020 2\b\b\u0002\u0010#\u001a\u00020\u0003J\u000e\u0010+\u001a\u00020 2\u0006\u0010#\u001a\u00020\u0003R\u000e\u0010\n\u001a\u00020\u000bX\u0082\u0004¢\u0006\u0002\n��R\u0011\u0010\u0007\u001a\u00020\b¢\u0006\b\n��\u001a\u0004\b\f\u0010\rR\u001a\u0010\u000e\u001a\u00020\u000fX\u0086\u000e¢\u0006\u000e\n��\u001a\u0004\b\u0010\u0010\u0011\"\u0004\b\u0012\u0010\u0013R\u0011\u0010\u0014\u001a\u00020\u00038F¢\u0006\u0006\u001a\u0004\b\u0015\u0010\u0016R\u0011\u0010\u0006\u001a\u00020\u0005¢\u0006\b\n��\u001a\u0004\b\u0017\u0010\u0018R\u000e\u0010\u0019\u001a\u00020\u001aX\u0082\u0004¢\u0006\u0002\n��R\u0011\u0010\u001b\u001a\u00020\u00038F¢\u0006\u0006\u001a\u0004\b\u001c\u0010\u0016R\u0011\u0010\u0004\u001a\u00020\u0005¢\u0006\b\n��\u001a\u0004\b\u001d\u0010\u0018R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\u001e\u0010\u0016¨\u0006,"}, d2 = {"Lai/platon/scent/tools/ScanningHarvestTaskExecutor;", "Lai/platon/scent/tools/HarvestTaskExecutor;", "urlBase", "", "start", "", "limit", "crawler", "Lai/platon/scent/tools/VerboseCrawler;", 
"(Ljava/lang/String;IILai/platon/scent/tools/VerboseCrawler;)V", "context", "Lai/platon/scent/context/support/AbstractScentContext;", "getCrawler", "()Lai/platon/scent/tools/VerboseCrawler;", "datasetPath", "Ljava/nio/file/Path;", "getDatasetPath", "()Ljava/nio/file/Path;", "setDatasetPath", "(Ljava/nio/file/Path;)V", "domain", "getDomain", "()Ljava/lang/String;", "getLimit", "()I", "logger", "Lorg/slf4j/Logger;", "origin", "getOrigin", "getStart", "getUrlBase", "check", "", "clearAnnotations", "encode", "args", "restrictCss", "annotated", "", "nodeFilter", "Lkotlin/Function1;", "Lorg/jsoup/nodes/Node;", "harvest", "kmeans", "scent-boot"})
@SourceDebugExtension({"SMAP\nScanningHarvestTaskExecutor.kt\nKotlin\n*S Kotlin\n*F\n+ 1 ScanningHarvestTaskExecutor.kt\nai/platon/scent/tools/ScanningHarvestTaskExecutor\n+ 2 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 3 ArraysJVM.kt\nkotlin/collections/ArraysKt__ArraysJVMKt\n+ 4 _Sequences.kt\nkotlin/sequences/SequencesKt___SequencesKt\n*L\n1#1,119:1\n1549#2:120\n1620#2,3:121\n37#3,2:124\n1324#4,3:126\n1324#4,3:129\n*S KotlinDebug\n*F\n+ 1 ScanningHarvestTaskExecutor.kt\nai/platon/scent/tools/ScanningHarvestTaskExecutor\n*L\n43#1:120\n43#1:121,3\n43#1:124,2\n48#1:126,3\n65#1:129,3\n*E\n"})
/* loaded from: input_file:ai/platon/scent/tools/ScanningHarvestTaskExecutor.class */
/**
 * Harvest task executor that works on pages already stored in the web database,
 * selected by scanning for URLs under {@link #getUrlBase()}.
 *
 * <p>The scan window is described by {@link #getStart()} (number of pages to skip)
 * and {@link #getLimit()} (maximum number of pages to process).
 *
 * <p>This class is the Java rendering of a Kotlin class; the {@code $default} members and
 * the constructor taking a {@link DefaultConstructorMarker} are the bridges the Kotlin compiler
 * generates for default arguments and are kept for binary compatibility.
 */
public final class ScanningHarvestTaskExecutor extends HarvestTaskExecutor {

    /** Pages are only checked/encoded when their content length is strictly greater than this value. */
    private static final long MIN_CONTENT_LENGTH = 800000L;

    /** Pages whose URL is longer than this many characters are skipped by {@link #encode}. */
    private static final int MAX_URL_LENGTH = 200;

    @NotNull
    private final String urlBase;
    private final int start;
    private final int limit;

    @NotNull
    private final VerboseCrawler crawler;

    @NotNull
    private final Logger logger;

    @NotNull
    private final AbstractScentContext context;

    @NotNull
    private Path datasetPath;

    /* compiled from: ScanningHarvestTaskExecutor.kt */
    @Metadata(mv = {1, 9, 0}, k = 3, xi = 48)
    /* loaded from: input_file:ai/platon/scent/tools/ScanningHarvestTaskExecutor$EntriesMappings.class */
    // Compiler-generated holder for the enum entries of GWebPage.Field. Declared static because it
    // only carries a static field and never needs an enclosing instance.
    public static /* synthetic */ class EntriesMappings {
        public static final /* synthetic */ EnumEntries<GWebPage.Field> entries$0 = EnumEntriesKt.enumEntries(GWebPage.Field.values());
    }

    /**
     * Creates an executor bound to the session of the given crawler.
     *
     * <p>Besides initializing the fields, construction derives the initial dataset path from
     * {@code start}, {@code limit} and the part of the domain name before its first {@code "."},
     * and registers an {@link AmazonAsinUrlNormalizer} on the session context's URL normalizer.
     *
     * @param urlBase URL prefix handed to the database scans
     * @param start   number of scanned pages to skip
     * @param limit   maximum number of pages to process
     * @param crawler crawler whose session is used and that performs encoding/harvesting
     * @throws NullPointerException if {@code urlBase} or {@code crawler} is {@code null}, or the
     *                              session has no context
     * @throws ClassCastException   if the session context is not an {@link AbstractScentContext}
     */
    public ScanningHarvestTaskExecutor(@NotNull String urlBase, int start, int limit, @NotNull VerboseCrawler crawler) {
        // Java requires the super call first, so a null crawler fails here before the explicit check below.
        super(crawler.getSession());
        Intrinsics.checkNotNullParameter(urlBase, "urlBase");
        Intrinsics.checkNotNullParameter(crawler, "crawler");
        this.urlBase = urlBase;
        this.start = start;
        this.limit = limit;
        this.crawler = crawler;
        this.logger = LogsKt.getLogger(Reflection.getOrCreateKotlinClass(ScanningHarvestTaskExecutor.class));
        // Null-check first, then narrow: mirrors the Kotlin "as AbstractScentContext" cast.
        Object sessionContext = getSession().getContext();
        Intrinsics.checkNotNull(sessionContext, "null cannot be cast to non-null type ai.platon.scent.context.support.AbstractScentContext");
        this.context = (AbstractScentContext) sessionContext;
        this.datasetPath = HarvestTaskExecutor.Companion.createDatasetPath(this.start, this.limit, StringsKt.substringBefore$default(getDomain(), ".", (String) null, 2, (Object) null));
        ChainedUrlNormalizer.add$default(getSession().getContext().getUrlNormalizer(), new AmazonAsinUrlNormalizer(), (String) null, 2, (Object) null);
    }

    /**
     * Kotlin default-argument bridge. Each set bit in {@code mask} replaces the corresponding
     * argument with its default: bit 1 - {@code "https://www.amazon.com/dp/"}, bit 2 - {@code 0},
     * bit 4 - {@code 6000}, bit 8 - a new {@link VerboseCrawler}.
     */
    public /* synthetic */ ScanningHarvestTaskExecutor(String urlBase, int start, int limit, VerboseCrawler crawler, int mask, DefaultConstructorMarker marker) {
        this((mask & 1) != 0 ? "https://www.amazon.com/dp/" : urlBase, (mask & 2) != 0 ? 0 : start, (mask & 4) != 0 ? 6000 : limit, (mask & 8) != 0 ? new VerboseCrawler((ScentContext) null, 1, (DefaultConstructorMarker) null) : crawler);
    }

    /** Creates an executor with every constructor argument set to its default. */
    public ScanningHarvestTaskExecutor() {
        this(null, 0, 0, null, 15, null);
    }

    /** Returns the URL prefix used for the database scans. */
    @NotNull
    public final String getUrlBase() {
        return this.urlBase;
    }

    /** Returns the number of scanned pages that are skipped. */
    public final int getStart() {
        return this.start;
    }

    /** Returns the maximum number of pages that are processed. */
    public final int getLimit() {
        return this.limit;
    }

    /** Returns the crawler used for encoding and harvesting. */
    @NotNull
    public final VerboseCrawler getCrawler() {
        return this.crawler;
    }

    /** Returns the origin of {@link #getUrlBase()} as computed by {@link URLUtil}. */
    @NotNull
    public final String getOrigin() {
        return URLUtil.INSTANCE.getOrigin(this.urlBase);
    }

    /**
     * Returns the domain name of {@link #getUrlBase()}, or {@code "unknown"} when
     * {@link URLUtil} cannot determine one.
     */
    @NotNull
    public final String getDomain() {
        String domainName = URLUtil.INSTANCE.getDomainName(this.urlBase);
        return domainName == null ? "unknown" : domainName;
    }

    /** Returns the path of the dataset file written by {@link #encode}. */
    @NotNull
    public final Path getDatasetPath() {
        return this.datasetPath;
    }

    /**
     * Replaces the path of the dataset file.
     *
     * @param path the new dataset path, must not be {@code null}
     */
    public final void setDatasetPath(@NotNull Path path) {
        Intrinsics.checkNotNullParameter(path, "<set-?>");
        this.datasetPath = path;
    }

    /**
     * Scans the web database and prints a two-line report for every selected page.
     *
     * <p>All {@link GWebPage.Field}s except {@code PAGE_MODEL} are requested from the scan. Pages
     * are kept when their URL contains the domain name of the URL base and their content length
     * exceeds {@value #MIN_CONTENT_LENGTH}; then {@code start} pages are skipped and at most
     * {@code limit} pages are reported.
     *
     * <p>Each page is parsed. If its URL does not start with the URL base and an ASIN URL can be
     * derived from it, a copy of the page is stored in the database under that normalized URL.
     *
     * <p>Printed per page: {@code <n>. <contentLength> | <ML label values> | <persistedContentLength> | <url>}
     * followed by the base URI of the parsed document.
     *
     * @throws IllegalArgumentException if no domain name can be derived from the URL base
     */
    public final void check() {
        Set<GWebPage.Field> fields = SetsKt.minus(CollectionsKt.toSet(EntriesMappings.entries$0), GWebPage.Field.PAGE_MODEL);
        List<String> fieldNames = new ArrayList<>(fields.size());
        for (GWebPage.Field field : fields) {
            fieldNames.add(field.toString());
        }
        Iterator<WebPage> scanned = this.context.getWebDb().scan(this.urlBase, fieldNames.toArray(new String[0]));
        String domainName = requireUrlBaseDomain();

        Sequence<WebPage> inDomain = SequencesKt.filter(SequencesKt.asSequence(scanned), page -> page.getUrl().contains(domainName));
        Sequence<WebPage> largeEnough = SequencesKt.filter(inDomain, page -> page.getContentLength() > MIN_CONTENT_LENGTH);
        Sequence<WebPage> selected = SequencesKt.take(SequencesKt.drop(largeEnough, this.start), this.limit);

        int nextIndex = 0;
        Iterator<WebPage> pages = selected.iterator();
        while (pages.hasNext()) {
            WebPage page = pages.next();
            int index = nextIndex++;
            if (index < 0) {
                CollectionsKt.throwIndexOverflow();
            }
            FeaturedDocument document = getSession().parse(page);
            String url = page.getUrl();
            Intrinsics.checkNotNullExpressionValue(url, "getUrl(...)");
            if (!url.startsWith(this.urlBase)) {
                String asinUrl = AmazonUrls.INSTANCE.normalizeAsinUrl(url);
                if (asinUrl != null) {
                    // Store a clone of the page under its normalized ASIN URL.
                    WebPage normalized = WebPage.newWebPage(asinUrl, page.getConf(), url);
                    Intrinsics.checkNotNullExpressionValue(normalized, "newWebPage(...)");
                    normalized.unsafeCloneGPage(page);
                    WebDb.put$default(this.context.getWebDb(), normalized, false, 2, (Object) null);
                }
            }
            Map<?, ?> labels = ScentWebPageExtKt.getMlLabels(page);
            Object labelValues = labels != null ? labels.values() : null;
            // The label values and the URL are reported here; previously both were fetched,
            // discarded, and the row number was printed in their place.
            System.out.println((index + 1) + ". " + page.getContentLength() + " | " + labelValues + " | " + page.getPersistedContentLength() + " | " + url);
            System.out.println(document.getBaseURI());
        }
    }

    /**
     * Clears the ML labels of the pages in the scan window and persists each page.
     *
     * <p>The window is: scan by URL base, skip {@code start} pages, take at most {@code limit}.
     */
    public final void clearAnnotations() {
        Sequence<WebPage> selected = SequencesKt.take(SequencesKt.drop(SequencesKt.asSequence(this.context.getWebDb().scan(this.urlBase)), this.start), this.limit);
        int nextIndex = 0;
        Iterator<WebPage> pages = selected.iterator();
        while (pages.hasNext()) {
            WebPage page = pages.next();
            if (nextIndex++ < 0) {
                CollectionsKt.throwIndexOverflow();
            }
            ScentWebPageExtKt.clearMLLabels(page);
            getSession().persist(page);
        }
    }

    /**
     * Encodes the selected pages into a dataset file at {@link #getDatasetPath()}.
     *
     * <p>An existing dataset file is deleted first. Pages come from a session scan and are kept when
     * their URL starts with the URL base, the URL is at most {@value #MAX_URL_LENGTH} characters,
     * the persisted content length exceeds {@value #MIN_CONTENT_LENGTH} and, if {@code annotated}
     * is set, the page has at least one ML label. Each remaining page is parsed and the first
     * element matching {@code restrictCss} is encoded; documents without a match are dropped.
     *
     * <p>If the dataset file exists afterwards it is copied via
     * {@code MLPaths.copyToLearnUnsupervised}; otherwise a notice is printed to standard output.
     *
     * @param args        option string turned into {@link HarvestOptions} by the session
     * @param restrictCss CSS selector choosing the root element of each document
     * @param annotated   when {@code true}, only pages with a non-empty ML label map are encoded
     * @param nodeFilter  node predicate passed through to the crawler's encoder
     * @throws IOException              if the existing dataset file cannot be deleted
     * @throws IllegalArgumentException if no domain name can be derived from the URL base
     */
    public final void encode(@NotNull String args, @NotNull String restrictCss, boolean annotated, @NotNull Function1<? super Node, Boolean> nodeFilter) throws IOException {
        Intrinsics.checkNotNullParameter(args, "args");
        Intrinsics.checkNotNullParameter(restrictCss, "restrictCss");
        Intrinsics.checkNotNullParameter(nodeFilter, "nodeFilter");
        Files.deleteIfExists(this.datasetPath);
        // Validation only: the domain name itself is not needed here.
        requireUrlBaseDomain();
        HarvestOptions options = getSession().options(args);

        Sequence<WebPage> scanned = getSession().scan(this.urlBase, options, this.start, this.limit, this.start, this.limit);
        Sequence<WebPage> underBase = SequencesKt.filter(scanned, page -> page.getUrl().startsWith(this.urlBase));
        Sequence<WebPage> shortUrls = SequencesKt.filter(underBase, page -> page.getUrl().length() <= MAX_URL_LENGTH);
        Sequence<WebPage> largeEnough = SequencesKt.filter(shortUrls, page -> page.getPersistedContentLength() > MIN_CONTENT_LENGTH);
        Sequence<WebPage> selected = SequencesKt.filter(largeEnough, page -> {
            if (!annotated) {
                return true;
            }
            Map<?, ?> labels = ScentWebPageExtKt.getMlLabels(page);
            return labels != null && !labels.isEmpty();
        });
        Sequence<FeaturedDocument> documents = SequencesKt.map(selected, page -> ScentSession.DefaultImpls.parse$default(getSession(), page, options, false, 4, (Object) null));
        Sequence<Element> rootElements = SequencesKt.mapNotNull(documents, document -> document.selectFirstOrNull(restrictCss));

        SimpleDataFrame dataset = this.crawler.encodeForElements(SequencesKt.asIterable(rootElements), new EncodeOptions(this.datasetPath, true, (List) null, 0, 0, 28, (DefaultConstructorMarker) null), nodeFilter);
        if (Files.exists(this.datasetPath, new LinkOption[0])) {
            MLPaths.INSTANCE.copyToLearnUnsupervised(this.datasetPath);
        } else {
            System.out.println("Dataset is not saved to " + this.datasetPath);
        }
        this.logger.info("Dataset size: " + dataset.getRecordCount() + ", total documents: " + dataset.getSize() + ", dataset exported:\n" + this.datasetPath);
        this.logger.info("All done.");
    }

    /**
     * Kotlin default-argument bridge for {@link #encode}. Set bits in {@code mask} select defaults:
     * bit 2 - {@code restrictCss = "body"}, bit 4 - {@code annotated = false},
     * bit 8 - a node filter accepting every node.
     */
    public static /* synthetic */ void encode$default(ScanningHarvestTaskExecutor executor, String args, String restrictCss, boolean annotated, Function1 nodeFilter, int mask, Object obj) throws IOException {
        if ((mask & 2) != 0) {
            restrictCss = "body";
        }
        if ((mask & 4) != 0) {
            annotated = false;
        }
        if ((mask & 8) != 0) {
            Function1<Node, Boolean> acceptAll = node -> {
                Intrinsics.checkNotNullParameter(node, "it");
                return true;
            };
            nodeFilter = acceptAll;
        }
        executor.encode(args, restrictCss, annotated, nodeFilter);
    }

    /**
     * Logs a pointer to the external project used for clustering; no clustering is done here.
     *
     * @param args unused apart from the null check
     */
    public final void kmeans(@NotNull String args) {
        Intrinsics.checkNotNullParameter(args, "args");
        this.logger.info("Use project scent-spark to cluster the dataset");
        this.logger.info("https://github.com/galaxyeye/scent-spark/blob/main/src/main/java/ai/platon/scent/ml/clustering/DomKMeans.java");
    }

    /**
     * Parses the pages in the scan window whose URL contains the domain name of the URL base and
     * hands the resulting documents to the crawler's {@code harvest}.
     *
     * <p>{@code " -diagnose -vj -trustSamples"} is appended to {@code args} before the options are
     * built. The JVM input arguments are logged before harvesting starts.
     *
     * @param args option string turned into {@link HarvestOptions} by the session
     * @throws IllegalArgumentException if no domain name can be derived from the URL base
     */
    public final void harvest(@NotNull String args) {
        Intrinsics.checkNotNullParameter(args, "args");
        HarvestOptions options = getSession().options(args + " -diagnose -vj -trustSamples");
        String domainName = requireUrlBaseDomain();

        Sequence<WebPage> scanned = getSession().scan(this.urlBase, options, this.start, this.limit, this.start, this.limit);
        Sequence<WebPage> inDomain = SequencesKt.filter(scanned, page -> page.getUrl().contains(domainName));
        Sequence<FeaturedDocument> documents = SequencesKt.map(inDomain, page -> ScentSession.DefaultImpls.parse$default(getSession(), page, options, false, 4, (Object) null));

        this.logger.info("{}", ManagementFactory.getRuntimeMXBean().getInputArguments());
        this.crawler.harvest(documents, options);
    }

    /** Kotlin default-argument bridge for {@link #harvest}: bit 1 of {@code mask} selects {@code args = ""}. */
    public static /* synthetic */ void harvest$default(ScanningHarvestTaskExecutor executor, String args, int mask, Object obj) {
        if ((mask & 1) != 0) {
            args = "";
        }
        executor.harvest(args);
    }

    /** Returns the domain name of the URL base, failing when none can be derived. */
    private String requireUrlBaseDomain() {
        String domainName = URLUtil.INSTANCE.getDomainName(this.urlBase);
        if (domainName == null) {
            throw new IllegalArgumentException("Invalid urlBase: " + this.urlBase);
        }
        return domainName;
    }
}
