sibedas/app/Jobs/ScrapingDataJob.php

<?php

namespace App\Jobs;

use App\Models\BigdataResume;
use App\Models\ImportDatasource;
use App\Models\PbgTask;
use App\Services\ServiceGoogleSheet;
use App\Services\ServicePbgTask;
use App\Services\ServiceTabPbgTask;
use App\Services\ServiceTokenSIMBG;
use GuzzleHttp\Client;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;

/**
 * Queued job that runs the scraping pipeline and records its outcome in an
 * ImportDatasource row.
 *
 * The job is placed on the "scraping" queue. Progress, success and failure are
 * written both to the application log and to the ImportDatasource record
 * created at the start of the run.
 */
class ScrapingDataJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * Create a new job instance.
     */
    public function __construct()
    {
        // Use dedicated scraping queue
        $this->queue = 'scraping';
    }

    /**
     * Execute the job with optimized schema:
     * 1. Scrape Google Sheet first (currently disabled, see note in body)
     * 2. Scrape PBG Task to get parent data
     * 3. Loop through parent tasks to scrape details via ServiceTabPbgTask
     * 4. Generate the BigData resume for the current year
     *
     * A failure on a single task is logged and skipped unless
     * isCriticalError() classifies it as critical. Any throwable that escapes
     * the pipeline marks the ImportDatasource as failed and fails the job
     * explicitly via fail().
     */
    public function handle()
    {
        $import_datasource = null;
        $failed_uuid = null;
        $processedTasks = 0;
        $totalTasks = 0;

        try {
            Log::info("=== SCRAPING DATA JOB STARTED ===");

            // Initialize services
            $service_google_sheet = app(ServiceGoogleSheet::class);
            $service_pbg_task = app(ServicePbgTask::class);
            $service_tab_pbg_task = app(ServiceTabPbgTask::class);

            // Create ImportDatasource record
            $import_datasource = ImportDatasource::create([
                'message' => 'Starting optimized scraping process...',
                'response_body' => null,
                'status' => 'processing',
                'start_time' => now(),
                'failed_uuid' => null
            ]);

            Log::info("ImportDatasource created", ['id' => $import_datasource->id]);

            // STEP 1: Scrape Google Sheet data first
            Log::info("=== STEP 1: SCRAPING GOOGLE SHEET ===");
            $import_datasource->update(['message' => 'Scraping Google Sheet data...']);

            // NOTE(review): the Google Sheet step is disabled here, yet the
            // "completed" log line below is still emitted. Confirm whether the
            // call should be re-enabled or the step removed entirely.
            // $service_google_sheet->run_service();
            Log::info("Google Sheet scraping completed successfully");

            // STEP 2: Scrape PBG Task to get parent data
            Log::info("=== STEP 2: SCRAPING PBG TASK PARENT DATA ===");
            $import_datasource->update(['message' => 'Scraping PBG Task parent data...']);

            $service_pbg_task->run_service();
            Log::info("PBG Task parent data scraping completed");

            // STEP 3: Get all PBG tasks for detail scraping
            $totalTasks = PbgTask::count();
            Log::info("=== STEP 3: SCRAPING PBG TASK DETAILS ===", [
                'total_tasks' => $totalTasks
            ]);

            $import_datasource->update([
                'message' => "Scraping details for {$totalTasks} PBG tasks..."
            ]);

            // Process tasks in chunks for memory efficiency
            $chunkSize = 100;

            PbgTask::orderBy('id')->chunk($chunkSize, function ($pbg_tasks) use (
                $service_tab_pbg_task,
                &$processedTasks,
                $totalTasks,
                $import_datasource,
                &$failed_uuid
            ) {
                foreach ($pbg_tasks as $pbg_task) {
                    try {
                        // Scrape all details for this task
                        $this->processTaskDetails($service_tab_pbg_task, $pbg_task->uuid);

                        $processedTasks++;

                        // Update progress every 10 tasks
                        if ($processedTasks % 10 === 0) {
                            // $totalTasks was counted before iteration began;
                            // guard the division so a zero count can never
                            // abort the run with a division-by-zero error.
                            $progress = $totalTasks > 0
                                ? round(($processedTasks / $totalTasks) * 100, 2)
                                : 0;
                            Log::info("Progress update", [
                                'processed' => $processedTasks,
                                'total' => $totalTasks,
                                'progress' => "{$progress}%"
                            ]);

                            $import_datasource->update([
                                'message' => "Processing details: {$processedTasks}/{$totalTasks} ({$progress}%)"
                            ]);
                        }

                    } catch (\Exception $e) {
                        Log::warning("Failed to process task details", [
                            'uuid' => $pbg_task->uuid,
                            'error' => $e->getMessage()
                        ]);

                        // Store failed UUID but continue processing
                        // (only the most recent failure is retained)
                        $failed_uuid = $pbg_task->uuid;

                        // Only stop if it's a critical error
                        if ($this->isCriticalError($e)) {
                            throw $e;
                        }
                    }
                }
            });

            Log::info("Task details scraping completed", [
                'processed_tasks' => $processedTasks,
                'total_tasks' => $totalTasks
            ]);

            // STEP 4: Generate BigData Resume
            Log::info("=== STEP 4: GENERATING BIGDATA RESUME ===");
            $import_datasource->update(['message' => 'Generating BigData resume...']);

            BigdataResume::generateResumeData($import_datasource->id, date('Y'), "simbg");

            Log::info("BigData resume generated successfully");

            // Update final status
            $import_datasource->update([
                'status' => 'success',
                'message' => "Scraping completed successfully. Processed {$processedTasks}/{$totalTasks} tasks.",
                'finish_time' => now(),
                'failed_uuid' => $failed_uuid // Store last failed UUID if any
            ]);

            Log::info("=== SCRAPING DATA JOB COMPLETED SUCCESSFULLY ===", [
                'import_datasource_id' => $import_datasource->id,
                'processed_tasks' => $processedTasks,
                'total_tasks' => $totalTasks,
                'has_failures' => !is_null($failed_uuid)
            ]);

        } catch (\Throwable $e) {
            // Catch \Throwable rather than \Exception so that PHP \Error
            // subclasses (TypeError, DivisionByZeroError, ...) also mark the
            // run as failed instead of leaving the record in "processing".
            Log::error('=== SCRAPING DATA JOB FAILED ===', [
                'error' => $e->getMessage(),
                'file' => $e->getFile(),
                'line' => $e->getLine(),
                'processed_tasks' => $processedTasks,
                'total_tasks' => $totalTasks,
                'failed_uuid' => $failed_uuid,
                'trace' => $e->getTraceAsString()
            ]);

            // Update ImportDatasource with failure info
            if ($import_datasource) {
                try {
                    $import_datasource->update([
                        'status' => 'failed',
                        'message' => "Scraping failed: {$e->getMessage()}. Processed {$processedTasks}/{$totalTasks} tasks.",
                        'response_body' => 'Scraping process interrupted due to error',
                        'finish_time' => now(),
                        'failed_uuid' => $failed_uuid,
                    ]);
                } catch (\Throwable $updateError) {
                    // The status write can itself fail (e.g. the database is
                    // the thing that broke). Log it and fall through so the
                    // job is still failed explicitly below.
                    Log::error('Failed to record scraping failure on ImportDatasource', [
                        'import_datasource_id' => $import_datasource->id,
                        'error' => $updateError->getMessage()
                    ]);
                }
            }

            // Don't retry this job
            $this->fail($e);
        }
    }

    /**
     * Process all detail endpoints for a single PBG task.
     *
     * Calls the five detail-scraping methods of the service in a fixed order;
     * an exception from any of them propagates to the caller and skips the
     * remaining calls for this task.
     *
     * @param ServiceTabPbgTask $service Service used to scrape the task details
     * @param string            $uuid    UUID of the PBG task to scrape
     */
    private function processTaskDetails(ServiceTabPbgTask $service, string $uuid): void
    {
        // Call all detail scraping methods for this task
        $service->scraping_task_details($uuid);
        $service->scraping_pbg_data_list($uuid);
        $service->scraping_task_retributions($uuid);
        $service->scraping_task_integrations($uuid);
        $service->scraping_task_detail_status($uuid);
    }

    /**
     * Determine if an error is critical enough to stop the entire process.
     *
     * The check is a case-insensitive substring match of the exception
     * message against a fixed list of phrases.
     *
     * @param \Exception $e Exception raised while processing a task
     * @return bool True when the message contains one of the critical phrases
     */
    private function isCriticalError(\Exception $e): bool
    {
        $criticalMessages = [
            'authentication failed',
            'token expired',
            'database connection',
            'memory exhausted',
            'maximum execution time'
        ];

        $errorMessage = strtolower($e->getMessage());

        foreach ($criticalMessages as $critical) {
            if (strpos($errorMessage, $critical) !== false) {
                return true;
            }
        }

        return false;
    }
}