Example complex function node to get and parse HTML email - using OutdoorActive weekly recommended routes

Hi all,

Here is an example of using a function node with some added Node.js modules to read emails via IMAP, sanitise and process the HTML content and extract details to a retained global variable.

I use an Android app called Outdoor Active which is great for planning and tracking walks and cycles. You can create your own routes or use other people's. Part of the subscription is a weekly set of 3 recommended routes from your area which come in as a rather complex HTML email.

I want to build up a database of these routes for future reference and having them in emails is not very convenient. Hence this flow.

Even if you don't use Outdoor Active, it may give you some clues as to how to go about doing something similar with different emails.

[{"id":"50221a3d80da3544","type":"inject","z":"ba86b64531fa3971","name":"template","props":[],"repeat":"","crontab":"","once":false,"onceDelay":0.1,"topic":"","x":375,"y":1480,"wires":[["57f6db7d4fa4757a"]],"l":false},{"id":"57f6db7d4fa4757a","type":"change","z":"ba86b64531fa3971","name":"Delete outdoorActive entries","rules":[{"t":"delete","p":"#:(file)::outdoorActive.undefined","pt":"global"}],"action":"","property":"","from":"","to":"","reg":false,"x":540,"y":1480,"wires":[[]]},{"id":"101d59a819ed3c07","type":"group","z":"ba86b64531fa3971","name":"Get OutdoorActive recommended weekly routes from email INBOX and save to global.outdooractive","style":{"fill":"#bfdbef","fill-opacity":"0.33","label":true,"color":"#000000"},"nodes":["f13d75bdcbb47b42","e331a282ae599a4c","bff1f71b7b7d53d4"],"x":134,"y":1359,"w":692,"h":82},{"id":"f13d75bdcbb47b42","type":"function","z":"ba86b64531fa3971","g":"101d59a819ed3c07","name":"Get OutdoorActive recommended routes from email","func":"const outdoorActive = global.get('outdoorActive', 'file') ?? 
{}\n\n// https://www.npmjs.com/package/imapflow\n// https://imapflow.com/module-imapflow-ImapFlow.html\nconst { ImapFlow } = imapflow\n\n// https://nodemailer.com/extras/mailparser/\nconst { simpleParser } = mailparser\n\n// https://www.npmjs.com/package/sanitize-html\nconst sanitizeOpts = {\n    allowedTags: [\n        \"address\", \"article\", \"aside\", \"footer\", \"header\", \"h1\", \"h2\", \"h3\", \"h4\",\n        \"h5\", \"h6\", \"hgroup\", \"main\", \"nav\", \"section\", \"blockquote\", \"dd\", \"div\",\n        \"dl\", \"dt\", \"figcaption\", \"figure\", \"hr\", \"li\", \"main\", \"ol\", \"p\", \"pre\",\n        \"ul\", \"a\", \"abbr\", \"b\", \"bdi\", \"bdo\", \"br\", \"cite\", \"code\", \"data\", \"dfn\",\n        \"em\", \"i\", \"kbd\", \"mark\", \"q\", \"rb\", \"rp\", \"rt\", \"rtc\", \"ruby\", \"s\", \"samp\",\n        \"small\", \"span\", \"strong\", \"sub\", \"sup\", \"time\", \"u\", \"var\", \"wbr\", \"caption\",\n        \"col\", \"colgroup\", \"table\", \"tbody\", \"td\", \"tfoot\", \"th\", \"thead\", \"tr\",\n        \"img\", \"html\", \"head\", \"body\",\n        // \"link\", \"meta\",\n    ],\n    allowedAttributes: {\n        '*': [\n            'id',\n            // 'class',\n        ],\n        a: [ 'href', 'name', 'target' ],\n        img: [ 'src', 'srcset', 'alt', 'title', 'width', 'height', 'loading' ],\n        // meta: [ '*' ],\n        // link: [ 'href', 'type' ],\n    },\n}\n\nconst client = new ImapFlow({\n    host: msg.host,\n    port: 993,\n    secure: true,\n    auth: {\n        user: msg.user,\n        pass: msg.pw,\n    },\n    logger: false,\n    emitLogs: true,\n})\n\nconst main = async () => {\n    // Wait until client connects and authorizes\n    try {\n        await client.connect()\n        node.status({fill:\"green\",shape:\"dot\",text:\"Connected\"})\n    } catch (e) {\n        node.status({fill:\"red\",shape:\"ring\",text:\"Failed to connect\"})\n        msg.payload = e\n        node.error(`Could not connect to mail 
server. ${e.message}`, msg)\n        node.send(msg)\n        return\n    }\n\n    // These are all optional\n    // msg.serverInfo = client.serverInfo\n    // msg.capabilities = client.capabilities\n    // msg.mailboxes = await client.listTree()\n    // msg.inboxStatus = await client.status('INBOX', { recent: true, unseen: true, messages: true, })\n\n    // What properties to return for each email\n    const fetchOptions = {\n        // bodyStructure: true,\n        // envelope: true,\n        // flags: true,\n        // headers: true,\n        // headers: [ 'date', 'subject', 'to', 'from', 'X-Spam-Score', 'X-CampaignID', ],\n        // labels: true,\n        // size: true,\n        source: true,\n        // threadId: true,\n        // uid: true,\n        // bodyParts: [\n        //     'text',\n        // ],\n    }\n    // What to search for\n    const search = {\n        // seen: false,\n        // flagged: true,\n        from: 'for-you@news.outdooractive.com',\n    }\n\n    // Select and lock a mailbox. 
Throws if mailbox does not exist\n    let mb = await client.getMailboxLock(msg.mailbox)\n    node.status({fill:\"blue\",shape:\"ring\",text:\"INBOX open\"})\n    // client.mailbox includes information about currently selected mailbox\n    msg.inbox = client.mailbox\n\n    try {\n        for await ( let message of client.fetch(search, fetchOptions, { uid: true }) ) {\n            // msg.message = message\n            const mail = await simpleParser(message.source)\n            // msg.mail = mail\n            const received = mail.date\n\n            const $ = cheerio.load(sanitizeHtml(mail.html, sanitizeOpts))\n            // const data = $('body > div:nth-child(2) > div:nth-child(1) > table > tbody > tr > td > div:nth-child(5) > table > tbody > tr > td > div')\n            const entries = $('body > div:nth-child(2) > div:nth-child(1) > table > tbody > tr > td > div:nth-child(5) > table > tbody > tr > td > div').find('div')\n\n            // debugger\n            entries.each( (i, entry) => {\n                const d = $(entry).find('table > tbody > tr > td > table > tbody')\n\n                // Title/Who: body > div:nth-child(1) > table > tbody > tr > td > table > tbody > tr:nth-child(1) > td > table > tbody > tr:nth-child(1) > td > a\n                const d1 = $(d).find('tr:nth-child(1) > td > table > tbody > tr:nth-child(1) > td > a')\n                let u\n                let routeId\n                const title = {}\n                if (d1[0]?.attribs?.href) {\n                    u = new url.URL(d1[0].attribs.href)\n                    routeId = u.pathname.split('/').slice(-1)[0]\n                    title.href = `${u.origin}${u.pathname}`\n                    title.text = d1[0].firstChild.data\n                }\n                const d2 = $(d).find('tr:nth-child(1) > td > table > tbody > tr:nth-child(2) > td > a')\n                const who = {}\n                if (d2[0]?.attribs?.href) {\n                    u = new url.URL(d2[0].attribs.href)\n         
           who.href = `${u.origin}${u.pathname}`\n                    who.text = d2[0].firstChild.data\n                }\n\n                // Image: body > div:nth-child(1) > table > tbody > tr > td > table > tbody > tr:nth-child(2) > td > table > tbody > tr > td > a\n                const d3 = $(d).find('tr:nth-child(2) > td > table > tbody > tr > td > a > img')\n                const image = {}\n                if (d3[0]?.attribs?.src) {\n                    image.src = d3[0].attribs.src\n                    image.width = d3[0].attribs.width\n                    image.height = d3[0].attribs.height\n                }\n\n                // Meta: body > div:nth-child(1) > table > tbody > tr > td > table > tbody > tr:nth-child(3) > td > table > tbody > tr:nth-child(2)\n                const d4 = $(d).find('tr:nth-child(3) > td > table > tbody > tr:nth-child(2) > td')\n                const details = {\n                    distance: d4[0]?.firstChild?.data,\n                    time: d4[1]?.firstChild?.data,\n                    up: d4[2]?.firstChild?.data,\n                    down: d4[3]?.firstChild?.data,\n                }\n\n                // console.log({i, title, who, image, details, d1, d2, d3, d4})\n\n                outdoorActive[routeId] = {title, who, image, details, received}\n                // node.send({payload: entry.html()})\n            })\n\n            msg.payload = outdoorActive\n\n            node.send(msg)\n        }\n    } finally {\n        // Make sure lock is released, otherwise next `getMailboxLock()` never returns\n        mb.release()\n        node.status({fill:\"grey\",shape:\"dot\",text:\"INBOX Lock Released\"})\n    }\n\n    // log out and close connection\n    await client.logout()\n    node.status({fill:\"grey\",shape:\"ring\",text:\"Logged out\"})\n}\n\nmain().catch(err => node.error(err))\n\nglobal.set('outdoorActive', outdoorActive, 'file')\n\n// return 
msg","outputs":1,"timeout":0,"noerr":0,"initialize":"","finalize":"","libs":[{"var":"imapflow","module":"imapflow"},{"var":"sanitizeHtml","module":"sanitize-html"},{"var":"mailparser","module":"mailparser"},{"var":"cheerio","module":"cheerio"},{"var":"url","module":"url"}],"x":530,"y":1400,"wires":[["e331a282ae599a4c"]]},{"id":"e331a282ae599a4c","type":"debug","z":"ba86b64531fa3971","g":"101d59a819ed3c07","name":"debug 451","active":true,"tosidebar":true,"console":false,"tostatus":true,"complete":"true","targetType":"full","statusVal":"","statusType":"counter","x":765,"y":1400,"wires":[],"l":false},{"id":"bff1f71b7b7d53d4","type":"inject","z":"ba86b64531fa3971","g":"101d59a819ed3c07","name":"template","props":[{"p":"host","v":"","vt":"str"},{"p":"user","v":"","vt":"str"},{"p":"pw","v":"","vt":"str"},{"p":"mailbox","v":"INBOX","vt":"str"}],"repeat":"","crontab":"","once":false,"onceDelay":0.1,"topic":"","x":240,"y":1400,"wires":[["f13d75bdcbb47b42"]]}]

image

  • imapflow - is an IMAP client. It connects to your mail server and searches for the appropriate emails. It can also set flags if you want to mark processed emails as "read" and could even move processed emails to an archive folder.

  • mailparser - is used to make sense of the content of each email since imapflow isn't perfect at that.

  • sanitize-html - cuts out the unnecessary styling and Microsoft rubbish from the email's HTML.

  • cheerio - is used to walk through the HTML to get to the required data.

  • url - is a Node.js built-in library and is used for removing the marketing references from the URLs.