// Not made by me. Base URL: justice.gov
const axios = require('axios');
const cheerio = require('cheerio');
/**
 * Matches a URL path segment that marks a news-like page: a segment starting
 * with "news" or "press", or a segment that is exactly "opa" or "pr".
 * Segment matching keeps "/pr" from matching "/privacy" or "/programs".
 */
const JUSTICE_LINK_PATTERN = /\/(?:news[^/?]*|press[^/?]*|opa|pr)(?:[/?]|$)/;

/**
 * Ordered [category, pattern] pairs; the first pattern that matches wins.
 * Word boundaries prevent false positives such as "immigrant" -> grant,
 * "issues" -> sue, or "refine" -> fine.
 */
const JUSTICE_CATEGORY_RULES = [
  ['Criminal Charges', /\b(?:indictments?|charged|arrest(?:s|ed|ing)?)\b/],
  ['Settlements', /\b(?:settlements?|fine[sd]?|penalt(?:y|ies))\b/],
  ['Litigation', /\b(?:lawsuits?|sue[sd]?|suing|complaints?)\b/],
  ['Investigations', /\b(?:investigations?|probe[sd]?)\b/],
  ['Official Statements', /\b(?:speech(?:es)?|statements?|remarks)\b/],
  ['Policy Updates', /\b(?:guidance|polic(?:y|ies))\b/],
  ['Grants & Awards', /\b(?:award(?:s|ed|ing|ees?)?|grant(?:s|ed|ing|ees?)?)\b/],
];

/**
 * Scrapes news / press-release links from the U.S. Department of Justice
 * website and normalises them into a flat list of article records.
 */
class JusticeGovScraper {
  /**
   * Sets the base URL, browser-like request headers, and a shared axios
   * instance with a 30 second timeout.
   */
  constructor() {
    this.baseUrl = 'https://www.justice.gov';
    this.headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1'
    };
    this.axiosInstance = axios.create({
      timeout: 30000,
      headers: this.headers
    });
  }

  /**
   * Fetches `${baseUrl}/news` and extracts articles using up to three
   * strategies (listing containers, generic link fallback, card components).
   *
   * Never rejects: when the request fails or nothing is extracted, the
   * bundled sample data is returned instead. On a request/parse failure the
   * payload additionally carries an `error` field with the failure message.
   *
   * @returns {Promise<object>} Result with `success`, `source`, `base_url`,
   *   `total` (unique articles found), `scraped_at` and `articles`
   *   (at most 15 entries).
   */
  async getNews() {
    try {
      console.log('⚖️ Mengambil berita dari U.S. Department of Justice...');
      const response = await this.axiosInstance.get(`${this.baseUrl}/news`);
      const $ = cheerio.load(response.data);
      const news = [];

      // Method 1: look inside the main listing containers.
      console.log('🔍 Mencari daftar berita...');
      const newsElements = $('.view-news, .news-listing, .view-content, .item-list, [class*="news"], [class*="press"]');
      if (newsElements.length > 0) {
        console.log('✅ Struktur berita ditemukan');
        news.push(...this.extractFromNewsListing($, newsElements));
      }

      // Method 2: fallback - scan every link that looks news-related.
      if (news.length === 0) {
        console.log('🔄 Menggunakan metode fallback...');
        news.push(...this.extractFallback($));
      }

      // Method 3: top up from card components when few articles were found.
      const cards = $('.card, .views-row, .teaser, .node');
      if (cards.length > 0 && news.length < 5) {
        console.log(`🃏 Mencari di ${cards.length} card...`);
        news.push(...this.extractFromCards($, cards));
      }

      // Drop navigation noise and duplicates.
      const uniqueNews = this.filterAndCleanNews(news);
      if (uniqueNews.length === 0) {
        return this.createSampleData();
      }

      return {
        success: true,
        source: 'U.S. Department of Justice',
        base_url: this.baseUrl,
        total: uniqueNews.length,
        scraped_at: new Date().toISOString(),
        articles: uniqueNews.slice(0, 15)
      };
    } catch (error) {
      console.error('❌ Error:', error.message);
      // Keep the best-effort fallback, but surface why it was used.
      return this.createSampleData(error.message);
    }
  }

  /**
   * Collapses runs of whitespace (including newlines) and trims the result.
   *
   * @param {*} text - Raw text; null/undefined become an empty string.
   * @returns {string} Single-line, trimmed text.
   */
  cleanText(text) {
    const value = text == null ? '' : String(text);
    return value.replace(/\s+/g, ' ').trim();
  }

  /**
   * Turns an href into an absolute URL.
   *
   * Absolute http(s) links are returned unchanged; everything else is
   * resolved against `baseUrl` with the URL parser, so "news/x", "/news/x"
   * and "//host/news/x" are all handled correctly.
   *
   * @param {string} link - Raw href attribute value.
   * @returns {string|null} Absolute URL, or null when it cannot be resolved.
   */
  resolveLink(link) {
    if (typeof link !== 'string' || link.length === 0) return null;
    if (/^https?:\/\//i.test(link)) return link;
    try {
      return new URL(link, this.baseUrl).href;
    } catch (_error) {
      return null;
    }
  }

  /**
   * Extracts articles from every anchor inside the given listing containers.
   *
   * @param {Function} $ - Loaded cheerio instance.
   * @param {object} container - Cheerio selection of listing containers.
   * @returns {object[]} Up to 20 article records of type "Press Release".
   */
  extractFromNewsListing($, container) {
    const news = [];
    container.find('a').each((index, element) => {
      if (news.length >= 20) return false; // returning false stops .each()

      const $el = $(element);
      const title = this.cleanText($el.text());
      const rawLink = $el.attr('href');
      if (title.length <= 20 || !this.isJusticeNewsLink(rawLink)) return true;

      const link = this.resolveLink(rawLink);
      if (!link) return true;

      // Date and summary live in the row/teaser that wraps the anchor.
      const $row = $el.closest('.views-row, .node, .teaser');
      // .first() avoids duplicated text when matches are nested (e.g. .date > time).
      const date = this.cleanText($row.find('.date, .created, time').first().text());
      const summary = this.cleanText($row.find('.field, .summary, .body').text()).substring(0, 200);

      news.push({
        title,
        link,
        date: date || 'Recent',
        summary: summary || 'U.S. Department of Justice press release',
        category: this.extractCategory(`${title} ${summary}`),
        type: 'Press Release'
      });
      return true;
    });
    return news;
  }

  /**
   * Extracts one article per card-like element.
   *
   * @param {Function} $ - Loaded cheerio instance.
   * @param {object} cards - Cheerio selection of card elements.
   * @returns {object[]} Up to 15 article records of type "Government News".
   */
  extractFromCards($, cards) {
    const news = [];
    cards.each((index, element) => {
      if (news.length >= 15) return false; // returning false stops .each()

      const $card = $(element);
      // Use only the first heading; .text() on several matches would glue
      // their texts together into one bogus title.
      const $title = $card.find('h2, h3, h4, .title, .field-title a').first();
      const title = this.cleanText($title.text());
      // Headings carry no href themselves: look at the title element, then an
      // anchor inside it, then the first anchor anywhere in the card.
      const rawLink = $title.attr('href') ||
        $title.find('a').first().attr('href') ||
        $card.find('a').first().attr('href');
      if (title.length <= 15 || !this.isJusticeNewsLink(rawLink)) return true;

      const link = this.resolveLink(rawLink);
      if (!link) return true;

      const date = this.cleanText($card.find('.date, time, .created').first().text());
      const summary = this.cleanText($card.find('.body, .summary, .field-body').text()).substring(0, 150);

      news.push({
        title,
        link,
        date: date || 'Recent',
        summary: summary || 'DOJ news release',
        category: this.extractCategory(title),
        type: 'Government News'
      });
      return true;
    });
    return news;
  }

  /**
   * Last-resort extraction: scans every anchor whose href mentions
   * /news, /press or /opa anywhere on the page.
   *
   * @param {Function} $ - Loaded cheerio instance.
   * @returns {object[]} Up to 10 article records of type "Government Release".
   */
  extractFallback($) {
    const news = [];
    $('a[href*="/news"], a[href*="/press"], a[href*="/opa"]').each((index, element) => {
      if (news.length >= 10) return false; // returning false stops .each()

      const $el = $(element);
      const title = this.cleanText($el.text());
      const rawLink = $el.attr('href');
      if (title.length <= 20 || !this.isJusticeNewsLink(rawLink)) return true;

      const link = this.resolveLink(rawLink);
      if (!link) return true;

      news.push({
        title,
        link,
        date: 'Recent',
        summary: 'U.S. Department of Justice announcement',
        category: this.extractCategory(title),
        type: 'Government Release'
      });
      return true;
    });
    return news;
  }

  /**
   * Decides whether an href looks like a news / press-release link.
   *
   * @param {string} link - Raw href attribute value.
   * @returns {boolean} True when the link is longer than 10 characters, has a
   *   news-like path segment, has no fragment and is not a /cdn-cgi/ link.
   */
  isJusticeNewsLink(link) {
    if (typeof link !== 'string' || link.length <= 10) return false;
    if (link.includes('/cdn-cgi/') || link.includes('#')) return false;
    return JUSTICE_LINK_PATTERN.test(link);
  }

  /**
   * Assigns a coarse category based on whole-word keywords in the text.
   *
   * @param {string} text - Title and/or summary text.
   * @returns {string} Matching category name, or "Justice News" when no
   *   keyword matches (including for empty or non-string input).
   */
  extractCategory(text) {
    const textLower = (text == null ? '' : String(text)).toLowerCase();
    const match = JUSTICE_CATEGORY_RULES.find(([, pattern]) => pattern.test(textLower));
    return match ? match[0] : 'Justice News';
  }

  /**
   * Removes navigation noise, too-short titles and duplicates.
   * Two items are duplicates when their titles are equal ignoring case and
   * punctuation, or when they point at the same link; the first one wins.
   *
   * @param {object[]} news - Candidate article records.
   * @returns {object[]} Filtered records in their original order.
   */
  filterAndCleanNews(news) {
    const seenTitles = new Set();
    const seenLinks = new Set();
    return news.filter((item) => {
      // Tolerate malformed entries instead of throwing on item.title.
      if (!item || typeof item.title !== 'string') return false;

      // Drop titles that are too short or are obviously page chrome.
      if (item.title.length < 15) return false;
      if (item.title.includes('Skip to main content')) return false;
      if (item.title.includes('Menu')) return false;

      // Fall back to the raw lowercase title when stripping leaves nothing
      // (e.g. a fully non-ASCII title), so such titles do not all collide.
      const lowered = item.title.toLowerCase();
      const key = lowered.replace(/[^a-z0-9]/g, '') || lowered;
      if (seenTitles.has(key)) return false;
      if (item.link && seenLinks.has(item.link)) return false;

      seenTitles.add(key);
      if (item.link) seenLinks.add(item.link);
      return true;
    });
  }

  /**
   * Builds the static fallback payload used when scraping yields nothing.
   *
   * @param {string} [reason] - Optional failure message; when given it is
   *   exposed as `error` so callers can tell a failed request from an empty
   *   page.
   * @returns {object} Result object in the same shape as `getNews()`, plus a
   *   `note` marking it as sample data.
   */
  createSampleData(reason) {
    // Hard-coded placeholder articles; these are NOT real scraped results.
    const sampleNews = [
      {
        title: "Justice Department Announces New Initiative to Combat Cyber Crime",
        link: "https://www.justice.gov/opa/pr/justice-department-announces-new-initiative-combat-cyber-crime",
        date: "January 15, 2025",
        summary: "The Department of Justice today announced a comprehensive new strategy to address the growing threat of cyber crime and ransomware attacks targeting critical infrastructure.",
        category: "Cyber Security",
        type: "Press Release"
      },
      {
        title: "Attorney General Delivers Remarks on Civil Rights Enforcement",
        link: "https://www.justice.gov/opa/speech/attorney-general-delivers-remarks-civil-rights-enforcement",
        date: "January 14, 2025",
        summary: "Attorney General emphasized the Department's commitment to protecting civil rights and combating discrimination in all forms.",
        category: "Civil Rights",
        type: "Official Speech"
      },
      {
        title: "Three Individuals Charged in Nationwide Healthcare Fraud Scheme",
        link: "https://www.justice.gov/opa/pr/three-individuals-charged-nationwide-healthcare-fraud-scheme",
        date: "January 13, 2025",
        summary: "Defendants allegedly submitted over $100 million in false claims to Medicare and Medicaid programs across multiple states.",
        category: "Criminal Charges",
        type: "Press Release"
      },
      {
        title: "Justice Department Reaches Settlement with Major Tech Company Over Privacy Violations",
        link: "https://www.justice.gov/opa/pr/justice-department-reaches-settlement-major-tech-company-over-privacy-violations",
        date: "January 12, 2025",
        summary: "Company agrees to implement comprehensive privacy reforms and pay significant civil penalty following DOJ investigation.",
        category: "Settlements",
        type: "Press Release"
      },
      {
        title: "FBI and DOJ Announce Results of Nationwide Law Enforcement Operation",
        link: "https://www.justice.gov/opa/pr/fbi-and-doj-announce-results-nationwide-law-enforcement-operation",
        date: "January 11, 2025",
        summary: "Operation resulted in hundreds of arrests and seizure of illegal firearms and narcotics across the United States.",
        category: "Law Enforcement",
        type: "Press Release"
      }
    ];

    const result = {
      success: true,
      source: 'U.S. Department of Justice',
      base_url: this.baseUrl,
      total: sampleNews.length,
      scraped_at: new Date().toISOString(),
      note: "Using sample data (website structure may have changed)",
      articles: sampleNews
    };
    if (reason) {
      result.error = reason;
    }
    return result;
  }
}
/**
 * Command-line entry point: prints a banner, scrapes the news once and
 * writes the result to stdout as pretty-printed JSON.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('⚖️ U.S. Department of Justice News Scraper\n');
  const payload = await new JusticeGovScraper().getNews();
  const pretty = JSON.stringify(payload, null, 2);
  console.log(pretty);
}
// Run the scraper only when this file is executed directly, not when it is
// required as a module.
if (require.main === module) {
  // Attach a rejection handler so an unexpected failure is reported and
  // reflected in the exit code instead of becoming an unhandled rejection.
  main().catch((error) => {
    console.error('❌ Error:', error);
    process.exitCode = 1;
  });
}

module.exports = JusticeGovScraper;