package ai.platon.scent.tools;

import ai.platon.pulsar.common.AppPaths;
import ai.platon.pulsar.common.LogsKt;
import ai.platon.pulsar.dom.FeaturedDocument;
import ai.platon.pulsar.persist.WebPage;
import ai.platon.pulsar.persist.gora.generated.GWebPage;
import ai.platon.pulsar.skeleton.crawl.common.URLUtil;
import ai.platon.pulsar.skeleton.crawl.filter.ChainedUrlNormalizer;
import ai.platon.scent.BasicScentSession;
import ai.platon.scent.ScentContext;
import ai.platon.scent.ScentSession;
import ai.platon.scent.common.ScentWebPageExtKt;
import ai.platon.scent.common.sites.amazon.AmazonAsinUrlNormalizer;
import ai.platon.scent.dom.HarvestOptions;
import ai.platon.scent.ml.EncodeOptions;
import ai.platon.scent.ml.data.FrameNode;
import ai.platon.scent.ml.data.SimpleDataFrame;
import ai.platon.scent.view.builder.PredicateResult2HTMLBuilder;
import java.lang.management.ManagementFactory;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.time.MonthDay;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import kotlin.Metadata;
import kotlin.collections.CollectionsKt;
import kotlin.collections.SetsKt;
import kotlin.enums.EnumEntries;
import kotlin.enums.EnumEntriesKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.Reflection;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.sequences.Sequence;
import kotlin.sequences.SequencesKt;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;

/* compiled from: HarvestTaskExecutor.kt */
@Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"��D\n\u0002\u0018\u0002\n\u0002\u0010��\n��\n\u0002\u0010\u000e\n��\n\u0002\u0010\b\n\u0002\b\u0003\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0007\n\u0002\u0018\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0004\n\u0002\u0010\u0002\n\u0002\b\u0007\n\u0002\u0010\u000b\n\u0002\b\u0004\u0018�� %2\u00020\u0001:\u0001%B#\u0012\b\b\u0002\u0010\u0002\u001a\u00020\u0003\u0012\b\b\u0002\u0010\u0004\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0006\u001a\u00020\u0005¢\u0006\u0002\u0010\u0007J\u0006\u0010\u0019\u001a\u00020\u001aJ\u0006\u0010\u001b\u001a\u00020\u001aJ\u001a\u0010\u001c\u001a\u00020\u001a2\u0006\u0010\u001d\u001a\u00020\u000b2\n\b\u0002\u0010\u001e\u001a\u0004\u0018\u00010\u000bJ\u0018\u0010\u001f\u001a\u00020\u001a2\u0006\u0010 \u001a\u00020\u00032\b\b\u0002\u0010!\u001a\u00020\"J\u0010\u0010#\u001a\u00020\u001a2\b\b\u0002\u0010 \u001a\u00020\u0003J\u000e\u0010$\u001a\u00020\u001a2\u0006\u0010 \u001a\u00020\u0003R\u000e\u0010\b\u001a\u00020\tX\u0082\u0004¢\u0006\u0002\n��R\u001a\u0010\n\u001a\u00020\u000bX\u0086\u000e¢\u0006\u000e\n��\u001a\u0004\b\f\u0010\r\"\u0004\b\u000e\u0010\u000fR\u0011\u0010\u0006\u001a\u00020\u0005¢\u0006\b\n��\u001a\u0004\b\u0010\u0010\u0011R\u000e\u0010\u0012\u001a\u00020\u0013X\u0082\u0004¢\u0006\u0002\n��R\u000e\u0010\u0014\u001a\u00020\u0015X\u0082\u0004¢\u0006\u0002\n��R\u0011\u0010\u0004\u001a\u00020\u0005¢\u0006\b\n��\u001a\u0004\b\u0016\u0010\u0011R\u0011\u0010\u0002\u001a\u00020\u0003¢\u0006\b\n��\u001a\u0004\b\u0017\u0010\u0018¨\u0006&"}, d2 = {"Lai/platon/scent/tools/HarvestTaskExecutor;", "", "urlBase", "", "start", "", "limit", "(Ljava/lang/String;II)V", "crawler", "Lai/platon/scent/tools/VerboseCrawler;", "datasetPath", "Ljava/nio/file/Path;", "getDatasetPath", "()Ljava/nio/file/Path;", "setDatasetPath", "(Ljava/nio/file/Path;)V", "getLimit", "()I", "logger", "Lorg/slf4j/Logger;", "session", "Lai/platon/scent/BasicScentSession;", "getStart", "getUrlBase", 
"()Ljava/lang/String;", "check", "", "clearAnnotations", "datasetToHTML", "clusteringResultDir", "outputPath", "encode", "args", "annotated", "", "harvest", "kmeans", "Companion", "scent-boot"})
@SourceDebugExtension({"SMAP\nHarvestTaskExecutor.kt\nKotlin\n*S Kotlin\n*F\n+ 1 HarvestTaskExecutor.kt\nai/platon/scent/tools/HarvestTaskExecutor\n+ 2 _Collections.kt\nkotlin/collections/CollectionsKt___CollectionsKt\n+ 3 ArraysJVM.kt\nkotlin/collections/ArraysKt__ArraysJVMKt\n+ 4 _Sequences.kt\nkotlin/sequences/SequencesKt___SequencesKt\n+ 5 fake.kt\nkotlin/jvm/internal/FakeKt\n*L\n1#1,125:1\n1549#2:126\n1620#2,3:127\n37#3,2:130\n1324#4,3:132\n1324#4,3:135\n1#5:138\n*S KotlinDebug\n*F\n+ 1 HarvestTaskExecutor.kt\nai/platon/scent/tools/HarvestTaskExecutor\n*L\n47#1:126\n47#1:127,3\n47#1:130,2\n52#1:132,3\n61#1:135,3\n*E\n"})
/* loaded from: input_file:ai/platon/scent/tools/HarvestTaskExecutor.class */
public final class HarvestTaskExecutor {

    /** Shared companion instance exposing {@code createDatasetPath}. */
    @NotNull
    public static final Companion Companion = new Companion(null);

    /** URL prefix handed to the scans; its domain name is also used to filter scanned pages. */
    @NotNull
    private final String urlBase;
    /** Number of leading pages to skip (passed to {@code drop} / {@code scan}). */
    private final int start;
    /** Maximum number of pages to process (passed to {@code take} / {@code scan}). */
    private final int limit;

    /** Logger obtained for this class in the constructor. */
    @NotNull
    private final Logger logger;

    /** Crawler used for {@code encodeElements} and {@code harvest}; it also supplies the session. */
    @NotNull
    private final VerboseCrawler crawler;

    /** Session taken from {@link #crawler} via {@code getSession()}. */
    @NotNull
    private final BasicScentSession session;

    /** Dataset file that {@code encode} deletes and then passes to {@code EncodeOptions}; replaceable via the setter. */
    @NotNull
    private Path datasetPath;

    /* compiled from: HarvestTaskExecutor.kt */
    @Metadata(mv = {1, 9, 0}, k = 1, xi = 48, d1 = {"�� \n\u0002\u0018\u0002\n\u0002\u0010��\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010\b\n\u0002\b\u0002\n\u0002\u0010\u000e\n��\b\u0086\u0003\u0018��2\u00020\u0001B\u0007\b\u0002¢\u0006\u0002\u0010\u0002J \u0010\u0003\u001a\u00020\u00042\u0006\u0010\u0005\u001a\u00020\u00062\u0006\u0010\u0007\u001a\u00020\u00062\b\b\u0002\u0010\b\u001a\u00020\t¨\u0006\n"}, d2 = {"Lai/platon/scent/tools/HarvestTaskExecutor$Companion;", "", "()V", "createDatasetPath", "Ljava/nio/file/Path;", "start", "", "limit", "ident", "", "scent-boot"})
    /* loaded from: input_file:ai/platon/scent/tools/HarvestTaskExecutor$Companion.class */
    public static final class Companion {
        private Companion() {
        }

        /**
         * Builds the path of the CSV dataset file for a scan window.
         *
         * <p>The file name has the shape
         * {@code dataset.[<ident>.]<month>.<day>.<start>-<limit>.csv}; the {@code <ident>.} segment
         * is omitted when {@code str} is empty. The name is resolved through
         * {@code AppPaths.INSTANCE.getProcTmpTmp("ml", ...)}.
         *
         * @param i   the scan start offset embedded in the file name
         * @param i2  the scan limit embedded in the file name
         * @param str an optional identifier ("ident") for the dataset; must not be {@code null}
         * @return the dataset path
         */
        @NotNull
        public final Path createDatasetPath(int i, int i2, @NotNull String str) {
            Intrinsics.checkNotNullParameter(str, "ident");
            // Read the clock once: two separate MonthDay.now() calls could straddle midnight and
            // combine the month of one day with the day-of-month of the next.
            MonthDay today = MonthDay.now();
            String prefix = str.isEmpty() ? "dataset." : "dataset." + str + ".";
            String fileName = prefix + today.getMonthValue() + "." + today.getDayOfMonth() + "." + i + "-" + i2 + ".csv";
            return AppPaths.INSTANCE.getProcTmpTmp("ml", new String[]{fileName});
        }

        /**
         * Compiler-generated bridge for the Kotlin default argument: when bit 2 (mask {@code 4})
         * of {@code i3} is set, {@code ident} defaults to the empty string.
         */
        public static /* synthetic */ Path createDatasetPath$default(Companion companion, int i, int i2, String str, int i3, Object obj) {
            if ((i3 & 4) != 0) {
                str = "";
            }
            return companion.createDatasetPath(i, i2, str);
        }

        public /* synthetic */ Companion(DefaultConstructorMarker defaultConstructorMarker) {
            this();
        }
    }

    /* compiled from: HarvestTaskExecutor.kt */
    @Metadata(mv = {1, 9, 0}, k = 3, xi = 48)
    /* loaded from: input_file:ai/platon/scent/tools/HarvestTaskExecutor$EntriesMappings.class */
    /* Compiler-generated holder that caches the enum entries of GWebPage.Field; read by check(). */
    public /* synthetic */ class EntriesMappings {
        /* Built once from GWebPage.Field.values() when this holder class is initialized. */
        public static final /* synthetic */ EnumEntries<GWebPage.Field> entries$0 = EnumEntriesKt.enumEntries(GWebPage.Field.values());
    }

    public HarvestTaskExecutor(@NotNull String str, int i, int i2) {
        Intrinsics.checkNotNullParameter(str, "urlBase");
        this.urlBase = str;
        this.start = i;
        this.limit = i2;
        this.logger = LogsKt.getLogger(Reflection.getOrCreateKotlinClass(HarvestTaskExecutor.class));
        this.crawler = new VerboseCrawler((ScentContext) null, 1, (DefaultConstructorMarker) null);
        this.session = this.crawler.getSession();
        this.datasetPath = Companion.createDatasetPath$default(Companion, this.start, this.limit, null, 4, null);
        ChainedUrlNormalizer.add$default(this.session.getContext().getUrlNormalizer(), new AmazonAsinUrlNormalizer(), (String) null, 2, (Object) null);
    }

    /*
     * Compiler-generated constructor for the Kotlin default arguments. Each set bit of i3 replaces
     * the matching argument with its default: bit 0 -> urlBase "https://www.amazon.com/dp/",
     * bit 1 -> start 0, bit 2 -> limit 6000.
     */
    public /* synthetic */ HarvestTaskExecutor(String str, int i, int i2, int i3, DefaultConstructorMarker defaultConstructorMarker) {
        this((i3 & 1) != 0 ? "https://www.amazon.com/dp/" : str, (i3 & 2) != 0 ? 0 : i, (i3 & 4) != 0 ? 6000 : i2);
    }

    /** Returns the URL base this executor was constructed with. */
    @NotNull
    public final String getUrlBase() {
        return this.urlBase;
    }

    /** Returns the number of leading pages that the scans skip. */
    public final int getStart() {
        return this.start;
    }

    /** Returns the maximum number of pages that the scans process. */
    public final int getLimit() {
        return this.limit;
    }

    /** Returns the dataset file path currently used by {@code encode}. */
    @NotNull
    public final Path getDatasetPath() {
        return this.datasetPath;
    }

    /**
     * Replaces the dataset file path used by {@code encode}.
     *
     * @param path the new dataset path; must not be {@code null}
     */
    public final void setDatasetPath(@NotNull Path path) {
        Intrinsics.checkNotNullParameter(path, "<set-?>");
        this.datasetPath = path;
    }

    /**
     * Prints a short report for every large stored page under {@code urlBase}.
     *
     * <p>Pages are scanned with every {@code GWebPage.Field} except {@code PAGE_MODEL}, restricted
     * to URLs containing the domain name of {@code urlBase} and to a content length above 800000,
     * then windowed by {@code start} (skipped) and {@code limit} (taken). For each remaining page
     * two lines go to standard output: a summary line and the base URI of the parsed document.
     *
     * @throws IllegalArgumentException if no domain name can be derived from {@code urlBase}
     */
    public final void check() {
        // Ask the store for every field except PAGE_MODEL.
        Set<GWebPage.Field> fields = SetsKt.minus(CollectionsKt.toSet(EntriesMappings.entries$0), GWebPage.Field.PAGE_MODEL);
        List<String> fieldNames = new ArrayList<>(fields.size());
        for (GWebPage.Field field : fields) {
            fieldNames.add(field.toString());
        }
        Iterator<WebPage> scan = this.session.getContext().getWebDb().scan(this.urlBase, fieldNames.toArray(new String[0]));

        final String domainName = URLUtil.INSTANCE.getDomainName(this.urlBase);
        if (domainName == null) {
            throw new IllegalArgumentException("Invalid urlBase: " + this.urlBase);
        }

        // Function1 is a single-method interface, so plain lambdas replace the decompiled anonymous
        // classes, whose `super(1)` initializer blocks are not valid Java.
        Sequence<WebPage> scanned = SequencesKt.asSequence(scan);
        Sequence<WebPage> onDomain = SequencesKt.filter(scanned, page -> page.getUrl().contains(domainName));
        Sequence<WebPage> large = SequencesKt.filter(onDomain, page -> page.getContentLength() > 800000);
        Sequence<WebPage> window = SequencesKt.take(SequencesKt.drop(large, this.start), this.limit);

        // A Kotlin Sequence is not a java.lang.Iterable, so it is walked through its iterator.
        int ordinal = 0;
        Iterator<WebPage> pages = window.iterator();
        while (pages.hasNext()) {
            WebPage page = pages.next();
            ordinal++; // 1-based position within the window
            long contentLength = page.getContentLength();
            long persistedContentLength = page.getPersistedContentLength();
            Map<?, ?> mlLabels = ScentWebPageExtKt.getMlLabels(page);
            Object labels = mlLabels != null ? mlLabels.values() : null;
            String url = page.getUrl();
            // The decompiled line printed the index in the label and URL slots while the values
            // computed just above were thrown away; they are put back where they were computed for.
            // NOTE(review): slot assignment (labels second, URL last) is inferred — confirm against the Kotlin source.
            System.out.println(ordinal + ". " + contentLength + " | " + labels + " | " + persistedContentLength + " | " + url);
            Object baseUri = this.session.parse(page).getBaseURI();
            System.out.println(baseUri);
        }
    }

    /**
     * Clears the ML labels of the stored pages under {@code urlBase} and persists each page.
     *
     * <p>The scan result is windowed by {@code start} (skipped) and {@code limit} (taken); no
     * domain or size filtering is applied here.
     */
    public final void clearAnnotations() {
        Iterator<WebPage> scan = this.session.getContext().getWebDb().scan(this.urlBase);
        Sequence<WebPage> window = SequencesKt.take(SequencesKt.drop(SequencesKt.asSequence(scan), this.start), this.limit);

        // A Kotlin Sequence is not a java.lang.Iterable, so it cannot drive a for-each loop; walk
        // its iterator instead. The unused element index of the decompiled loop is dropped.
        Iterator<WebPage> pages = window.iterator();
        while (pages.hasNext()) {
            WebPage page = pages.next();
            ScentWebPageExtKt.clearMLLabels(page);
            this.session.persist(page);
        }
    }

    /**
     * Encodes the {@code body} element of each selected page into the dataset file.
     *
     * <p>Any existing file at the dataset path is deleted first. Pages are scanned under
     * {@code urlBase}, restricted to URLs containing its domain name and, when {@code z} is set,
     * to pages that carry at least one ML label. Each page is parsed, its first {@code body}
     * element (if any) is collected, and the elements are handed to the crawler's
     * {@code encodeElements} together with the dataset path. The resulting sizes are logged.
     *
     * @param str the harvest option string ("args"); must not be {@code null}
     * @param z   "annotated": when {@code true}, only pages with non-empty ML labels are encoded
     * @throws IllegalArgumentException if no domain name can be derived from {@code urlBase}
     */
    public final void encode(@NotNull String str, final boolean z) {
        Intrinsics.checkNotNullParameter(str, "args");
        Files.deleteIfExists(this.datasetPath);
        final String domainName = URLUtil.INSTANCE.getDomainName(this.urlBase);
        if (domainName == null) {
            throw new IllegalArgumentException("Invalid urlBase: " + this.urlBase);
        }
        final HarvestOptions options = this.session.options(str);

        // NOTE(review): start/limit are passed twice, exactly as in the decompiled call — verify
        // against the ScentSession.scan signature.
        Sequence<WebPage> scanned = this.session.scan(this.urlBase, options, this.start, this.limit, this.start, this.limit);

        // Function1 is a single-method interface, so plain lambdas replace the decompiled anonymous
        // classes, whose `super(1)` initializer blocks are not valid Java.
        Sequence<WebPage> onDomain = SequencesKt.filter(scanned, page -> page.getUrl().contains(domainName));
        Sequence<WebPage> selected = SequencesKt.filter(onDomain, page -> {
            if (!z) {
                return true;
            }
            Map<?, ?> labels = ScentWebPageExtKt.getMlLabels(page);
            return labels != null && !labels.isEmpty();
        });
        Sequence<FeaturedDocument> documents = SequencesKt.map(selected,
                page -> ScentSession.DefaultImpls.parse$default(this.session, page, options, false, 4, (Object) null));
        // Documents without a body element yield null and are dropped by mapNotNull.
        Sequence<Element> bodies = SequencesKt.mapNotNull(documents, document -> document.selectFirstOrNull("body"));

        SimpleDataFrame encodeElements = this.crawler.encodeElements(SequencesKt.asIterable(bodies),
                new EncodeOptions(this.datasetPath, true, (List) null, 0, 0, 28, (DefaultConstructorMarker) null));

        // Total number of points across all frame nodes.
        int totalPoints = 0;
        Iterator<?> nodes = encodeElements.getNodes().iterator();
        while (nodes.hasNext()) {
            totalPoints += ((FrameNode) nodes.next()).getPoints().size();
        }
        this.logger.info("Dataset size: " + totalPoints + ", total documents: " + encodeElements.getSize() + ", dataset exported:\n" + this.datasetPath);
        this.logger.info("All done.");
    }

    /*
     * Compiler-generated bridge for the Kotlin default argument of encode(): when bit 1 (mask 2)
     * of i is set, `annotated` falls back to false.
     */
    public static /* synthetic */ void encode$default(HarvestTaskExecutor harvestTaskExecutor, String str, boolean z, int i, Object obj) {
        boolean annotated = (i & 2) == 0 && z;
        harvestTaskExecutor.encode(str, annotated);
    }

    /**
     * Logs a pointer to the external clustering project; no clustering is performed here.
     *
     * @param str the option string ("args"); only null-checked, otherwise unused
     */
    public final void kmeans(@NotNull String str) {
        Intrinsics.checkNotNullParameter(str, "args");
        final Logger log = this.logger;
        final String hint = "Use project scent-spark to cluster the dataset";
        final String link = "https://github.com/galaxyeye/scent-spark/blob/17ed245a6a31d4014f10d7f00d35362d46109e0c/src/main/java/ai/platon/scent/ml/clustering/DomKMeans.java";
        log.info(hint);
        log.info(link);
    }

    /**
     * Converts the files of a clustering result directory to HTML and logs the builder's result.
     *
     * <p>If the directory does not exist, a message is logged and nothing else happens.
     *
     * @param path  the clustering result directory ("clusteringResultDir"); must not be {@code null}
     * @param path2 the output path handed to {@code PredicateResult2HTMLBuilder}; may be {@code null}
     */
    public final void datasetToHTML(@NotNull Path path, @Nullable Path path2) {
        Intrinsics.checkNotNullParameter(path, "clusteringResultDir");
        if (!Files.exists(path, new LinkOption[0])) {
            this.logger.info("Clustering result not found | " + path);
            return;
        }
        // Files.list keeps the directory open until the stream is closed, so the listing is
        // collected inside try-with-resources to release the handle.
        List<Path> list;
        try (Stream<Path> entries = Files.list(path)) {
            list = entries.collect(Collectors.toList());
        }
        Intrinsics.checkNotNull(list);
        this.logger.info("Dataset converted to HTML | " + new PredicateResult2HTMLBuilder(list, path2).build());
    }

    /*
     * Compiler-generated bridge for the Kotlin default argument of datasetToHTML(): when bit 1
     * (mask 2) of i is set, the output path falls back to null.
     */
    public static /* synthetic */ void datasetToHTML$default(HarvestTaskExecutor harvestTaskExecutor, Path path, Path path2, int i, Object obj) {
        Path outputPath = (i & 2) != 0 ? null : path2;
        harvestTaskExecutor.datasetToHTML(path, outputPath);
    }

    /**
     * Harvests the stored pages under {@code urlBase}.
     *
     * <p>The option string is extended with {@code " -diagnose -vj -trustSamples"}. Pages are
     * scanned under {@code urlBase}, restricted to URLs containing its domain name, parsed into
     * documents, and the resulting sequence is handed to the crawler's {@code harvest}. The JVM
     * input arguments are logged before harvesting starts.
     *
     * @param str the harvest option string ("args"); must not be {@code null}
     * @throws IllegalArgumentException if no domain name can be derived from {@code urlBase}
     */
    public final void harvest(@NotNull String str) {
        Intrinsics.checkNotNullParameter(str, "args");
        final HarvestOptions options = this.session.options(str + " -diagnose -vj -trustSamples");
        final String domainName = URLUtil.INSTANCE.getDomainName(this.urlBase);
        if (domainName == null) {
            throw new IllegalArgumentException("Invalid urlBase: " + this.urlBase);
        }

        // NOTE(review): start/limit are passed twice, exactly as in the decompiled call — verify
        // against the ScentSession.scan signature.
        Sequence<WebPage> scanned = this.session.scan(this.urlBase, options, this.start, this.limit, this.start, this.limit);

        // Function1 is a single-method interface, so plain lambdas replace the decompiled anonymous
        // classes, whose `super(1)` initializer blocks are not valid Java.
        Sequence<WebPage> onDomain = SequencesKt.filter(scanned, page -> page.getUrl().contains(domainName));
        Sequence<FeaturedDocument> documents = SequencesKt.map(onDomain,
                page -> ScentSession.DefaultImpls.parse$default(this.session, page, options, false, 4, (Object) null));

        this.logger.info("{}", ManagementFactory.getRuntimeMXBean().getInputArguments());
        this.crawler.harvest(documents, options);
    }

    /*
     * Compiler-generated bridge for the Kotlin default argument of harvest(): when bit 0 (mask 1)
     * of i is set, the option string falls back to "".
     */
    public static /* synthetic */ void harvest$default(HarvestTaskExecutor harvestTaskExecutor, String str, int i, Object obj) {
        String args = (i & 1) != 0 ? "" : str;
        harvestTaskExecutor.harvest(args);
    }

    /**
     * Creates an executor with every argument defaulted: mask {@code 7} selects the default
     * {@code urlBase}, {@code start} and {@code limit} in the defaults constructor.
     */
    public HarvestTaskExecutor() {
        this(null, 0, 0, 7, null);
    }
}
