<tr> <th scope="row" style="text-align:center;"><a href="/wiki/Poirot%27s_Early_Cases#The_Adventure_of_the_Clapham_Cook" title="Poirot's Early Cases">The Adventure of the Clapham Cook</a></th> <td style="text-align:center;">8 January 1989</td> <tr> <th scope="row" style="text-align:center;"><a href="/wiki/Murder_in_the_Mews_(TV_1989)" class="mw-redirect" title="Murder in the Mews (TV 1989)">Murder in the Mews</a></th> <td style="text-align:center;">15 January 1989</td>
I want the text from the <a> tag that resides inside the <th> tag.
I found a JavaScript library named cheerio that seemed to do what I required.
$ npm install request cheerio
// index.js
const request = require('request');
const cheerio = require('cheerio');

// Page whose table header cells (<th>) contain the episode-title links.
const EPISODES_URL = "https://en.wikipedia.org/wiki/List_of_Agatha_Christie's_Poirot_episodes";

/**
 * Download the episode-list page and print the text of every <a> found
 * inside a <th>, one per line, stopping after the title that starts with
 * "Curtain". Anchors whose text starts with "[fn" (footnote markers) are
 * skipped.
 */
request({ uri: EPISODES_URL }, (err, response, html) => {
  // On a transport-level failure `response` is undefined, so the error must
  // be checked on its own before touching `response.statusCode`.
  if (err) {
    console.log('E R R O R ' + err.message);
    return;
  }
  if (response.statusCode !== 200) {
    console.log('E R R O R ' + response.statusCode);
    return;
  }

  const $ = cheerio.load(html);
  $('th a').each((i, elem) => {
    const episode = $(elem).text();

    // Eliminate "[fn" anchors (footnotes).
    if (!episode.startsWith('[fn')) {
      console.log(episode);
    }

    // Returning false from an `.each` callback ends the iteration, so
    // nothing after the "Curtain" entry is printed.
    return !episode.startsWith('Curtain');
  });
});
$ node index.js
The Adventure of the Clapham Cook
Murder in the Mews
The Adventure of Johnnie Waverly
Four and Twenty Blackbirds
The Third Floor Flat
Triangle at Rhodes
Problem at Sea
The Incredible Theft
The King of Clubs
The Dream
Peril at End House
The Veiled Lady
The Lost Mine
The Cornish Mystery
The Disappearance of Mr. Davenheim
Double Sin
The Adventure of the Cheap Flat
. . .
Cat Among the Pigeons
Third Girl
Appointment with Death
Three Act Tragedy
Hallowe'en Party
Murder on the Orient Express
The Clocks
Elephants Can Remember
The Big Four
Dead Man's Folly
The Labours of Hercules
Curtain: Poirot's Last Case