// jroakes
// 12/13/2023 - 12:08 AM
//
// Extract Title Terms from Google

// Edit me ///////////////////////////////////////////////
// Phrase to search for. It is sent to Google wrapped in double quotes
// (the %22 pair in searchUrl below).
const search = "Google * and SEO";
// Edit me ///////////////////////////////////////////////

// URL-encode the trimmed phrase and use "+" for every space.
// String.prototype.replace with a string pattern (' ') only replaces the
// first match, which left raw spaces in the URL, and trimming after the
// replacement was too late to drop a leading space.
const searchQ = encodeURIComponent(search.trim()).replace(/%20/g, '+');
const searchUrl = `https://www.google.com/search?q=%22${searchQ}%22&num=1000`;
// Raw gist holding NLTK's list of English stop words.
const stopwordsUrl = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%20list%20of%20english%20stopwords";
// Shared stop-word set; declared here so the fetch pipeline can fill and read it.
const stopWords = new Set();
 
// Fetch the stop words, fetch the Google results page, then print the
// de-duplicated title phrases that remain once stop words and the most
// frequent words have been removed.

/**
 * Normalise text the same way for titles and stop words: trim, lower-case,
 * and drop every character that is not a letter, digit or whitespace
 * (underscores are dropped too).
 * @param {string} text - Raw text.
 * @returns {string} Normalised text.
 */
function normalizeTitleText(text) {
  return text.trim().toLowerCase().replace(/[^\w\s]|_/g, "");
}

/**
 * Add every non-blank line of a newline-separated list to a stop-word set.
 * Each word is stored both verbatim and in normalised form, because title
 * words have their punctuation stripped ("don't" becomes "dont") and would
 * otherwise never match the apostrophised stop word.
 * @param {Set<string>} stopWordSet - Set to fill (mutated in place).
 * @param {string} rawList - Newline-separated stop words.
 * @returns {Set<string>} The same set, for convenience.
 */
function addStopWords(stopWordSet, rawList) {
  for (const line of rawList.split(/\n/)) {
    const word = line.trim();
    if (word === "") continue; // blank lines carry no stop word
    stopWordSet.add(word);
    stopWordSet.add(normalizeTitleText(word));
  }
  return stopWordSet;
}

/**
 * Split one title into normalised words, dropping stop words and the empty
 * tokens that split() yields for empty or punctuation-led titles.
 * @param {string} title - Raw title text.
 * @param {Set<string>} stopWordSet - Words to discard.
 * @returns {string[]} Remaining words in their original order.
 */
function tokenizeTitle(title, stopWordSet) {
  return normalizeTitleText(title)
    .split(/\s+/)
    .filter(word => word !== "" && !stopWordSet.has(word));
}

/**
 * Count how often each word occurs across all token lists.
 * A Map is used instead of a plain object so that words such as
 * "constructor" cannot collide with Object.prototype members.
 * @param {string[][]} tokenLists - One word list per title.
 * @returns {Map<string, number>} Word -> occurrence count.
 */
function countWords(tokenLists) {
  const counts = new Map();
  for (const words of tokenLists) {
    for (const word of words) {
      counts.set(word, (counts.get(word) || 0) + 1);
    }
  }
  return counts;
}

/**
 * Find the words whose count is strictly above the count located at the
 * given percentile of the ascending list of counts.
 * @param {Map<string, number>} counts - Word -> occurrence count.
 * @param {number} percentile - Fraction in [0, 1] locating the threshold.
 * @returns {Set<string>} Words considered too frequent.
 */
function findFrequentWords(counts, percentile) {
  const sortedCounts = [...counts.values()].sort((a, b) => a - b);
  if (sortedCounts.length === 0) return new Set();

  // Clamp so a percentile of 1 still indexes the last element.
  const thresholdIndex = Math.min(
    Math.floor(sortedCounts.length * percentile),
    sortedCounts.length - 1
  );
  const thresholdValue = sortedCounts[thresholdIndex];

  const frequent = new Set();
  for (const [word, count] of counts) {
    if (count > thresholdValue) frequent.add(word);
  }
  return frequent;
}

/**
 * Turn raw titles into a set of unique phrases: stop words and overly
 * frequent words are removed, and only phrases that still contain more than
 * one word are kept.
 * @param {string[]} titles - Raw title strings.
 * @param {Set<string>} stopWordSet - Words to discard.
 * @param {number} [percentile=0.98] - Threshold position, see findFrequentWords.
 * @returns {Set<string>} Unique phrases in first-seen order.
 */
function extractTitlePhrases(titles, stopWordSet, percentile = 0.98) {
  const tokenLists = titles.map(title => tokenizeTitle(title, stopWordSet));
  const frequentWords = findFrequentWords(countWords(tokenLists), percentile);

  const phrases = new Set();
  for (const words of tokenLists) {
    const kept = words.filter(word => !frequentWords.has(word));
    if (kept.length > 1) phrases.add(kept.join(' '));
  }
  return phrases;
}

/**
 * Download the stop words and the search results page, then log every
 * extracted phrase. Any failure is reported through console.error.
 * @returns {Promise<void>}
 */
async function printTitlePhrases() {
  try {
    const stopwordsResponse = await fetch(stopwordsUrl);
    if (!stopwordsResponse.ok) throw new Error('Network response was not ok');
    addStopWords(stopWords, await stopwordsResponse.text());

    const searchResponse = await fetch(searchUrl);
    if (!searchResponse.ok) throw new Error('Network response was not ok');
    const data = await searchResponse.text();

    // The titles are read from the text content of the page's <h3> elements.
    const htmlDoc = new DOMParser().parseFromString(data, "text/html");
    const titles = Array.from(htmlDoc.querySelectorAll('h3'), h3 => h3.textContent);

    extractTitlePhrases(titles, stopWords).forEach(text => console.log(text));
  } catch (error) {
    console.error('Fetch error:', error);
  }
}

printTitlePhrases();