Back

Scraper News Justice

66 views
hirox Code by: @hirox

Description

Bukan saya yang membuat kode ini. Base URL: justice.gov

Features

Code (javascript)

justice.js
const axios = require('axios');
const cheerio = require('cheerio');

/**
 * Scrapes the news listing page of justice.gov and returns a normalized list
 * of articles.
 *
 * Extraction is heuristic: several selector strategies are tried in turn and
 * their results are merged and de-duplicated. When nothing can be extracted
 * (or the request fails) a hard-coded sample payload is returned instead; that
 * payload is flagged with `is_sample: true` so callers can tell it apart from
 * real results.
 */
class JusticeGovScraper {
    /**
     * Sets the base URL, the browser-like request headers and the shared
     * axios instance (30 s timeout).
     */
    constructor() {
        this.baseUrl = 'https://www.justice.gov';
        // NOTE(review): advertising `br` assumes the installed axios version can
        // decode Brotli responses — confirm against the project's axios version.
        this.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        };
        this.axiosInstance = axios.create({
            timeout: 30000,
            headers: this.headers
        });
    }

    /**
     * Fetches `${baseUrl}/news` and extracts articles from it.
     *
     * Never rejects: any error is logged and converted into the sample payload
     * (with the error message attached as `error`).
     *
     * @returns {Promise<Object>} Result object with `success`, `source`,
     *   `base_url`, `total`, `scraped_at`, `is_sample` and `articles`.
     *   `total` is the number of unique articles found, while `articles` is
     *   capped at the first 15 of them.
     */
    async getNews() {
        try {
            console.log('⚖️ Mengambil berita dari U.S. Department of Justice...');

            const response = await this.axiosInstance.get(`${this.baseUrl}/news`);
            const $ = cheerio.load(response.data);

            const news = [];

            // Method 1: look inside containers that look like a news listing.
            console.log('🔍 Mencari daftar berita...');

            const newsElements = $('.view-news, .news-listing, .view-content, .item-list, [class*="news"], [class*="press"]');

            if (newsElements.length > 0) {
                console.log('✅ Struktur berita ditemukan');
                const extractedNews = this.extractFromNewsListing($, newsElements);
                news.push(...extractedNews);
            }

            // Method 2: fallback — scan every link on the page that looks relevant.
            if (news.length === 0) {
                console.log('🔄 Menggunakan metode fallback...');
                const fallbackNews = this.extractFallback($);
                news.push(...fallbackNews);
            }

            // Method 3: card-style components, only used to top up a thin result.
            const cards = $('.card, .views-row, .teaser, .node');
            if (cards.length > 0 && news.length < 5) {
                console.log(`🃏 Mencari di ${cards.length} card...`);
                const cardNews = this.extractFromCards($, cards);
                news.push(...cardNews);
            }

            // Drop duplicates and navigation noise.
            const uniqueNews = this.filterAndCleanNews(news);

            if (uniqueNews.length === 0) {
                return this.createSampleData();
            }

            return {
                success: true,
                source: 'U.S. Department of Justice',
                base_url: this.baseUrl,
                total: uniqueNews.length,
                scraped_at: new Date().toISOString(),
                is_sample: false,
                articles: uniqueNews.slice(0, 15)
            };

        } catch (error) {
            console.error('❌ Error:', error.message);
            // Best-effort behaviour is kept, but the cause is now surfaced to the caller.
            return this.createSampleData(error.message);
        }
    }

    /**
     * Collapses every run of whitespace into a single space and trims the ends.
     *
     * @param {*} text - Raw text (typically from cheerio's `.text()`); nullish
     *   values are treated as an empty string.
     * @returns {string} The normalized text.
     */
    cleanText(text) {
        return String(text || '').replace(/\s+/g, ' ').trim();
    }

    /**
     * Resolves an href against the site's base URL.
     *
     * Handles root-relative (`/a/b`), path-relative (`a/b`) and
     * protocol-relative (`//host/a`) hrefs; absolute URLs pass through.
     *
     * @param {string} link - The href to resolve.
     * @returns {string} The absolute URL, or `link` unchanged if it cannot be parsed.
     */
    resolveLink(link) {
        try {
            return new URL(link, this.baseUrl).href;
        } catch (error) {
            return link;
        }
    }

    /**
     * Extracts articles from the anchors inside the given listing containers.
     * Stops after 20 articles.
     *
     * @param {Function} $ - The cheerio root function for the loaded page.
     * @param {Object} container - Cheerio selection of listing containers.
     * @returns {Array<Object>} Extracted articles (type `'Press Release'`).
     */
    extractFromNewsListing($, container) {
        const news = [];

        container.find('a').each((index, element) => {
            if (news.length >= 20) return false;

            const $el = $(element);
            const title = this.cleanText($el.text());
            const href = $el.attr('href');

            if (title.length <= 20 || !this.isJusticeNewsLink(href)) return undefined;

            // Date and summary are looked up in the nearest enclosing row, if any.
            const $row = $el.closest('.views-row, .node, .teaser');
            // `.first()` avoids concatenating the text of nested/sibling date nodes.
            const date = this.cleanText($row.find('.date, .created, time').first().text());
            const summary = this.cleanText($row.find('.field, .summary, .body').text())
                .substring(0, 200)
                .trim();

            news.push({
                title: title,
                link: this.resolveLink(href),
                date: date || 'Recent',
                summary: summary || 'U.S. Department of Justice press release',
                category: this.extractCategory(`${title} ${summary}`),
                type: 'Press Release'
            });
            return undefined;
        });

        return news;
    }

    /**
     * Extracts one article per card-like element. Stops after 15 articles.
     *
     * @param {Function} $ - The cheerio root function for the loaded page.
     * @param {Object} cards - Cheerio selection of card elements.
     * @returns {Array<Object>} Extracted articles (type `'Government News'`).
     */
    extractFromCards($, cards) {
        const news = [];

        cards.each((index, element) => {
            if (news.length >= 15) return false;

            const $card = $(element);
            // Only the first title-like node is used; `.text()` on the whole
            // match would glue every heading in the card together.
            const $title = $card.find('h2, h3, h4, .title, .field-title a').first();
            const title = this.cleanText($title.text());
            // Prefer the title's own href, then an anchor inside the title,
            // then the first anchor anywhere in the card.
            const href = $title.attr('href') ||
                $title.find('a').first().attr('href') ||
                $card.find('a').first().attr('href');

            if (title.length <= 15 || !this.isJusticeNewsLink(href)) return undefined;

            const date = this.cleanText($card.find('.date, time, .created').first().text());
            const summary = this.cleanText($card.find('.body, .summary, .field-body').text())
                .substring(0, 150)
                .trim();

            news.push({
                title: title,
                link: this.resolveLink(href),
                date: date || 'Recent',
                summary: summary || 'DOJ news release',
                category: this.extractCategory(title),
                type: 'Government News'
            });
            return undefined;
        });

        return news;
    }

    /**
     * Last-resort extraction: scans every anchor whose href mentions a
     * news/press/opa path. Stops after 10 articles.
     *
     * @param {Function} $ - The cheerio root function for the loaded page.
     * @returns {Array<Object>} Extracted articles (type `'Government Release'`).
     */
    extractFallback($) {
        const news = [];

        $('a[href*="/news"], a[href*="/press"], a[href*="/opa"]').each((index, element) => {
            if (news.length >= 10) return false;

            const $el = $(element);
            const title = this.cleanText($el.text());
            const href = $el.attr('href');

            if (title.length <= 20 || !this.isJusticeNewsLink(href)) return undefined;

            news.push({
                title: title,
                link: this.resolveLink(href),
                date: 'Recent',
                summary: 'U.S. Department of Justice announcement',
                category: this.extractCategory(title),
                type: 'Government Release'
            });
            return undefined;
        });

        return news;
    }

    /**
     * Decides whether an href looks like a news / press-release link.
     *
     * Only the URL *path* is inspected, segment by segment, so that hosts such
     * as `news.example.com` or paths such as `/privacy-policy` and `/programs`
     * no longer match by accident. The host itself is not restricted.
     *
     * @param {string} link - Raw href as found in the page.
     * @returns {boolean} `true` when the link should be treated as an article link.
     */
    isJusticeNewsLink(link) {
        if (typeof link !== 'string' || link.length <= 10) return false;
        if (link.includes('/cdn-cgi/') || link.includes('#')) return false;

        let url;
        try {
            url = new URL(link, this.baseUrl);
        } catch (error) {
            return false;
        }
        // Rejects mailto:, tel:, javascript: and similar schemes.
        if (url.protocol !== 'http:' && url.protocol !== 'https:') return false;

        const segments = url.pathname.toLowerCase().split('/').filter(Boolean);
        return segments.some(segment =>
            segment === 'opa' ||
            segment === 'pr' ||
            segment.startsWith('news') ||
            segment.startsWith('press'));
    }

    /**
     * Assigns a coarse category based on keywords found in the text.
     *
     * Keywords must start at a word boundary, so inflections still match
     * ("sues", "fined", "grants") while unrelated words that merely contain a
     * keyword ("issues", "define", "immigrant") do not. Rules are checked in
     * order; the first match wins.
     *
     * @param {string} text - Title and/or summary text.
     * @returns {string} The category label, or `'Justice News'` if nothing matches.
     */
    extractCategory(text) {
        const haystack = String(text || '').toLowerCase();
        const rules = [
            ['Criminal Charges', /\b(?:indictment|charged|arrest)/],
            ['Settlements', /\b(?:settlement|fine|penalty)/],
            ['Litigation', /\b(?:lawsuit|sue|complaint)/],
            ['Investigations', /\b(?:investigation|probe)/],
            ['Official Statements', /\b(?:speech|statement|remarks)/],
            ['Policy Updates', /\b(?:guidance|policy)/],
            ['Grants & Awards', /\b(?:award|grant)/]
        ];

        const matched = rules.find(([, pattern]) => pattern.test(haystack));
        return matched ? matched[0] : 'Justice News';
    }

    /**
     * Removes navigation noise, too-short titles and duplicate articles.
     *
     * Duplicates are detected by title, compared case-insensitively with all
     * non-alphanumeric characters removed. The first occurrence is kept.
     *
     * @param {Array<Object>} news - Articles with at least a `title` string.
     * @returns {Array<Object>} The filtered articles, in their original order.
     */
    filterAndCleanNews(news) {
        const seen = new Set();
        return news.filter(item => {
            if (!item || typeof item.title !== 'string') return false;

            // Drop titles that are too short or that are page chrome.
            if (item.title.length < 15) return false;
            if (item.title.includes('Skip to main content')) return false;
            if (item.title.includes('Menu')) return false;

            // De-duplicate on the normalized title.
            const key = item.title.toLowerCase().replace(/[^a-z0-9]/g, '');
            if (seen.has(key)) return false;
            seen.add(key);

            return true;
        });
    }

    /**
     * Builds the hard-coded sample payload used when scraping yields nothing.
     *
     * The payload keeps `success: true` for backward compatibility but is
     * explicitly marked with `is_sample: true`; the articles in it are
     * placeholders, not scraped content.
     *
     * @param {string} [reason] - Optional failure description; when given it is
     *   exposed as `error` on the returned object.
     * @returns {Object} Result object in the same shape as `getNews()` returns.
     */
    createSampleData(reason) {
        const sampleNews = [
            {
                title: "Justice Department Announces New Initiative to Combat Cyber Crime",
                link: "https://www.justice.gov/opa/pr/justice-department-announces-new-initiative-combat-cyber-crime",
                date: "January 15, 2025",
                summary: "The Department of Justice today announced a comprehensive new strategy to address the growing threat of cyber crime and ransomware attacks targeting critical infrastructure.",
                category: "Cyber Security",
                type: "Press Release"
            },
            {
                title: "Attorney General Delivers Remarks on Civil Rights Enforcement",
                link: "https://www.justice.gov/opa/speech/attorney-general-delivers-remarks-civil-rights-enforcement",
                date: "January 14, 2025",
                summary: "Attorney General emphasized the Department's commitment to protecting civil rights and combating discrimination in all forms.",
                category: "Civil Rights",
                type: "Official Speech"
            },
            {
                title: "Three Individuals Charged in Nationwide Healthcare Fraud Scheme",
                link: "https://www.justice.gov/opa/pr/three-individuals-charged-nationwide-healthcare-fraud-scheme",
                date: "January 13, 2025",
                summary: "Defendants allegedly submitted over $100 million in false claims to Medicare and Medicaid programs across multiple states.",
                category: "Criminal Charges",
                type: "Press Release"
            },
            {
                title: "Justice Department Reaches Settlement with Major Tech Company Over Privacy Violations",
                link: "https://www.justice.gov/opa/pr/justice-department-reaches-settlement-major-tech-company-over-privacy-violations",
                date: "January 12, 2025",
                summary: "Company agrees to implement comprehensive privacy reforms and pay significant civil penalty following DOJ investigation.",
                category: "Settlements",
                type: "Press Release"
            },
            {
                title: "FBI and DOJ Announce Results of Nationwide Law Enforcement Operation",
                link: "https://www.justice.gov/opa/pr/fbi-and-doj-announce-results-nationwide-law-enforcement-operation",
                date: "January 11, 2025",
                summary: "Operation resulted in hundreds of arrests and seizure of illegal firearms and narcotics across the United States.",
                category: "Law Enforcement",
                type: "Press Release"
            }
        ];

        const payload = {
            success: true,
            source: 'U.S. Department of Justice',
            base_url: this.baseUrl,
            total: sampleNews.length,
            scraped_at: new Date().toISOString(),
            note: "Using sample data (website structure may have changed)",
            is_sample: true,
            articles: sampleNews
        };

        if (reason) {
            payload.error = String(reason);
        }

        return payload;
    }
}

// Run langsung
/**
 * Command-line entry point: prints a banner, runs one scrape and writes the
 * result to stdout as pretty-printed JSON.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log('⚖️ U.S. Department of Justice News Scraper\n');

    const result = await new JusticeGovScraper().getNews();
    const prettyJson = JSON.stringify(result, null, 2);

    console.log(prettyJson);
}

// Run the scraper only when this file is executed directly, not when it is
// loaded via require() by another module.
if (require.main === module) {
    // Handle a rejection explicitly instead of leaving the promise floating.
    main().catch((error) => {
        console.error('❌ Error:', error && error.message ? error.message : error);
        process.exitCode = 1;
    });
}

module.exports = JusticeGovScraper;