Modify TheMovieDB scraper to get additional MPAA Rating info from IMDB
#1
I am attempting to modify TheMovieDB scraper (TMDB.xml) to pick up additional MPAA rating information from IMDB (e.g. Rated R "for excessive violence"). I can get the "Rated R" portion from the certification tag on TMDB, but when I try to append the additional information from IMDB using a custom function, nothing shows up. I have been beating my head against this for a couple of days, so I thought I would ask for some help, since documentation of scraper development is scarce at best. I have tested in ScraperXMLEdition using the movie "2012", and everything seems to be working correctly: the certification is read from TMDB, and the additional info is read from IMDB and appended. It just doesn't show up in XBMC as "MPAA Rating" as it should. What am I doing wrong? Any help would be appreciated.

Note: I am using Camelot as Dharma is randomly crashing on me.

Here is my full tmdb.xml file:
Code:
<?xml version="1.0" encoding="utf-8"?><scraper framework="1.1" date="2011-01-20" name="themoviedb.org" content="movies" thumb="tmdb.png" language="en">
    <include>common/tmdb.xml</include>
    <include>common/dtrailer.xml</include>
    <GetSettings dest="3">
        <RegExp input="$$5" output="&lt;settings&gt;\1&lt;/settings&gt;" dest="3">
            <RegExp input="$$1" output="&lt;setting label=&quot;Enable Fanart&quot; type=&quot;bool&quot; id=&quot;fanart&quot; default=&quot;true&quot;&gt;&lt;/setting&gt;" dest="5+">
                <expression />
            </RegExp>
            <RegExp input="$$1" output="&lt;setting label=&quot;Enable Trailers from Dtrailer.com&quot; type=&quot;bool&quot; id=&quot;dtrailer&quot; default=&quot;false&quot;&gt;&lt;/setting&gt;" dest="5+">
                <expression />
            </RegExp>
            <RegExp input="$$1" output="&lt;setting label=&quot;Enable additional MPAA Ratings info from IMDB.com&quot; type=&quot;bool&quot; id=&quot;imdbmpaa&quot; default=&quot;true&quot;&gt;&lt;/setting&gt;" dest="5+">
                <expression />
            </RegExp>
            <expression noclean="1" />
        </RegExp>
    </GetSettings>
    <CreateSearchUrl dest="3">
        <RegExp input="$$1" output="&lt;url&gt;http://api.themoviedb.org/2.1/Movie.search/en/xml/57983e31fb435df4df77afb854740ea9/\1&lt;/url&gt;" dest="3">
            <RegExp input="$$2" output="%20(\1)" dest="4">
                <expression clear="yes">(.+)</expression>
            </RegExp>
            <expression noclean="1" />
        </RegExp>
    </CreateSearchUrl>
    <NfoUrl dest="3">
        <RegExp input="$$1" output="&lt;url&gt;http://api.themoviedb.org/2.1/Movie.getInfo/en/xml/57983e31fb435df4df77afb854740ea9/\2&lt;/url&gt;&lt;id&gt;\2&lt;/id&gt;" dest="3">
            <expression clear="yes" noclean="1">(themoviedb.org/movie/)([0-9]*)</expression>
        </RegExp>
        <RegExp input="$$1" output="&lt;url function=&quot;GetTMDBId&quot;&gt;http://api.themoviedb.org/2.0/Movie.imdbLookup?imdb_id=tt\2&amp;amp;api_key=57983e31fb435df4df77afb854740ea9&lt;/url&gt;" dest="3+">
            <expression>(imdb.com/title/tt)([0-9]*)</expression>
        </RegExp>
        <RegExp input="$$1" output="&lt;url function=&quot;GetTMDBId&quot;&gt;http://api.themoviedb.org/2.0/Movie.imdbLookup?imdb_id=tt\2&amp;amp;api_key=57983e31fb435df4df77afb854740ea9&lt;/url&gt;" dest="3+">
            <expression>(imdb.com/)Title\?([0-9]+)</expression>
        </RegExp>
    </NfoUrl>
    <GetTMDBId dest="3">
        <RegExp input="$$1" output="&lt;url cache=&quot;tmdb-\1.xml&quot;&gt;http://api.themoviedb.org/2.1/Movie.getInfo/en/xml/57983e31fb435df4df77afb854740ea9/\1&lt;/url&gt;&lt;id&gt;\1&lt;/id&gt;" dest="3+">
            <expression>&lt;id&gt;([0-9]*)&lt;/id&gt;</expression>
        </RegExp>
    </GetTMDBId>
    <GetSearchResults dest="8">
        <RegExp input="$$3" output="&lt;results&gt;\1&lt;/results&gt;" dest="8">
            <RegExp input="$$1" output="&lt;entity&gt;&lt;title&gt;\1&lt;/title&gt;&lt;id&gt;\2&lt;/id&gt;&lt;url cache=&quot;tmdb-\2.xml&quot;&gt;http://api.themoviedb.org/2.1/Movie.getInfo/en/xml/57983e31fb435df4df77afb854740ea9/\2&lt;/url&gt;&lt;/entity&gt;" dest="3">
                <expression repeat="yes">&lt;movie&gt;.*?&lt;name&gt;([^&lt;]*)&lt;/name&gt;.*?&lt;id&gt;([^&lt;]*)&lt;/id&gt;.*?&lt;/movie&gt;</expression>
            </RegExp>
            <expression noclean="1" />
        </RegExp>
    </GetSearchResults>
    <GetDetails clearbuffers="no" dest="3">
        <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
            <RegExp input="$$1" output="\1" dest="6">
                <expression noclean="1"></expression>
            </RegExp>

            <RegExp input="$$1" output="&lt;title&gt;\1&lt;/title&gt;" dest="5">
                <expression>&lt;name&gt;([^&lt;]*)&lt;/name&gt;</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;year&gt;\1&lt;/year&gt;" dest="5+">
                <expression>&lt;released&gt;([0-9]+)-</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;runtime&gt;\1&lt;/runtime&gt;" dest="5+">
                <expression>&lt;runtime&gt;([^&lt;]*)&lt;/runtime&gt;</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;rating&gt;\1&lt;/rating&gt;" dest="5+">
                <expression>&lt;rating&gt;([^&lt;]*)&lt;/rating&gt;</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;genre&gt;\1&lt;/genre&gt;" dest="5+">
                <expression repeat="yes">&lt;category type="genre".*?name="([^"]*)"</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;studio&gt;\1&lt;/studio&gt;" dest="5+">
                <expression repeat="yes">&lt;studio.*?name="([^"]*)"</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;plot&gt;\1&lt;/plot&gt;" dest="5+">
                <expression>&lt;overview&gt;([^&lt;]*)&lt;/overview&gt;</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;overview&gt;\1&lt;/overview&gt;" dest="5+">
                <expression>&lt;overview&gt;([^&lt;]*)&lt;/overview&gt;</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;director&gt;\1&lt;/director&gt;" dest="5+">
                <expression repeat="yes">&lt;person.*?name="([^"]*)" character="" job="Director"</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;credits&gt;\1&lt;/credits&gt;" dest="5+">
                <expression repeat="yes">&lt;person.*?name="([^"]*)" character="" job="Author"</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;\2&lt;/role&gt;&lt;/actor&gt;" dest="5+">
                <expression repeat="yes">&lt;person.*?name="([^"]*)" character="([^"]*)" job="Actor"</expression>
            </RegExp>
            <RegExp input="$$1" output="&lt;actor&gt;&lt;name&gt;\1&lt;/name&gt;&lt;role&gt;\2&lt;/role&gt;&lt;thumb&gt;http://\4\5&lt;/thumb&gt;&lt;/actor&gt;" dest="5+">
                <expression repeat="yes">&lt;person.*?name="([^"]*)" character="([^"]*)" job="Actor" id="([0-9]+)" thumb="http://(.*?)_thumb([^"]*)"</expression>
            </RegExp>
            
            <!--Get the MPAA info from IMDB.com if the IMDBmpaa setting is set to true-->
            <RegExp input="$$1" output="&lt;url cache=&quot;\1-mpaa.html&quot; function=&quot;GetIMDBMPAAInfo&quot;&gt;http://www.imdb.com/title/\1&lt;/url&gt;" dest="5+">
                <expression noclean="1">&lt;imdb_id&gt;([^&lt;]*)&lt;/imdb_id&gt;</expression>
            </RegExp>

            <RegExp input="$$1" output="&lt;url cache=&quot;tmdb-$$2.xml&quot; function=&quot;GetTMDBThumbsById&quot;&gt;$$3&lt;/url&gt;" dest="5+">
                <expression />
            </RegExp>
            <RegExp conditional="fanart" input="$$1" output="&lt;url cache=&quot;tmdb-$$2.xml&quot; function=&quot;GetTMDBFanartById&quot;&gt;$$3&lt;/url&gt;" dest="5+">
                <expression />
            </RegExp>
            <RegExp conditional="dtrailer" input="$$6" output="&lt;url function=&quot;GetDTrailerLink&quot;&gt;http://en.dtrailer.com/movies/search/\1&lt;/url&gt;" dest="5+">
                <RegExp input="$$4" output="\1-" dest="6">
                    <RegExp input="$$1" output="\1" dest="4">
                        <expression>&lt;name&gt;([^&lt;]*)&lt;/name&gt;</expression>
                    </RegExp>
                    <expression repeat="yes">([a-zA-Z0-9]+)</expression>
                </RegExp>
                <expression />
            </RegExp>
            <expression noclean="1" />
        </RegExp>
    </GetDetails>
    <GetIMDBMPAAInfo clearbuffers="no" dest="8">
        <RegExp input="$$6" output="&lt;details&gt;&lt;mpaa&gt;Rated \1 " dest="8">
            <expression>&lt;certification&gt;([^&lt;]*)&lt;/certification&gt;</expression>            
        </RegExp>
        <RegExp input="$$1" output="\1&lt;/mpaa&gt;&lt;/details&gt;" dest="8+">
            <expression>MPAA&lt;/a&gt;\)&lt;/h4&gt;\nRated [PGRNC\-137]{1,5} ([^\.*]*)</expression>
        </RegExp>
    </GetIMDBMPAAInfo>
</scraper>

These are the specific portions I have added to GetDetails. First, I copy the TMDB page into buffer 6 for use in my custom function. Then I call the custom function to get the MPAA rating plus the additional information:
Code:
        <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
            <RegExp input="$$1" output="\1" dest="6">
                <expression noclean="1"></expression>
            </RegExp>
...
            <!--Get the MPAA info from IMDB.com if the IMDBmpaa setting is set to true-->
            <RegExp input="$$1" output="&lt;url cache=&quot;\1-mpaa.html&quot; function=&quot;GetIMDBMPAAInfo&quot;&gt;http://www.imdb.com/title/\1&lt;/url&gt;" dest="5+">
                <expression noclean="1">&lt;imdb_id&gt;([^&lt;]*)&lt;/imdb_id&gt;</expression>
            </RegExp>

Then my custom function. Build up something like "<details><mpaa>Rated R for excessive violence</mpaa></details>":
Code:
    <GetIMDBMPAAInfo clearbuffers="no" dest="8">
        <RegExp input="$$6" output="&lt;details&gt;&lt;mpaa&gt;Rated \1 " dest="8">
            <expression>&lt;certification&gt;([^&lt;]*)&lt;/certification&gt;</expression>            
        </RegExp>
        <RegExp input="$$1" output="\1&lt;/mpaa&gt;&lt;/details&gt;" dest="8+">
            <expression>MPAA&lt;/a&gt;\)&lt;/h4&gt;\nRated [PGRNC\-137]{1,5} ([^\.*]*)</expression>
        </RegExp>
    </GetIMDBMPAAInfo>
Reply
#2
Hi Hideous, I know this post is from a while ago, but have you been able to resolve this for Frodo? I am bummed that nothing scrapes this info. I would really like it to read something like:

"Rated R for bloody violence and language"

Thanks,

H.
Image
Reply

Logout Mark Read Team Forum Stats Members Help
Modify TheMovieDB scraper to get additional MPAA Rating info from IMDB0